diff --git a/Makefile b/Makefile index 3604d1da0..7f0eca1ae 100644 --- a/Makefile +++ b/Makefile @@ -153,6 +153,7 @@ include dsp/mpeg/mpeg.mk # │ include dsp/dsp.mk # │ include third_party/zlib/gz/gz.mk # │ include third_party/musl/musl.mk # │ +include third_party/intel/intel.mk # │ include libc/libc.mk #─┘ include libc/sock/sock.mk #─┐ include dsp/tty/tty.mk # ├──ONLINE RUNTIME diff --git a/dsp/tty/internal.h b/dsp/tty/internal.h index 92ad706f8..a6dc83cfb 100644 --- a/dsp/tty/internal.h +++ b/dsp/tty/internal.h @@ -1,7 +1,7 @@ #ifndef COSMOPOLITAN_DSP_TTY_INTERNAL_H_ #define COSMOPOLITAN_DSP_TTY_INTERNAL_H_ #include "dsp/tty/ttyrgb.h" -#include "libc/intrin/xmmintrin.internal.h" +#include "third_party/intel/xmmintrin.internal.h" #if !(__ASSEMBLER__ + __LINKER__ + 0) COSMOPOLITAN_C_START_ diff --git a/dsp/tty/quant.h b/dsp/tty/quant.h index 37cb354b9..10bcbb95c 100644 --- a/dsp/tty/quant.h +++ b/dsp/tty/quant.h @@ -3,9 +3,9 @@ #include "dsp/tty/ttyrgb.h" #include "libc/assert.h" #include "libc/intrin/bits.h" -#include "libc/intrin/xmmintrin.internal.h" #include "libc/limits.h" #include "libc/str/str.h" +#include "third_party/intel/xmmintrin.internal.h" #if !(__ASSEMBLER__ + __LINKER__ + 0) COSMOPOLITAN_C_START_ diff --git a/dsp/tty/rgb2ttyf2i.c b/dsp/tty/rgb2ttyf2i.c index 146a04128..2f5c70d2e 100644 --- a/dsp/tty/rgb2ttyf2i.c +++ b/dsp/tty/rgb2ttyf2i.c @@ -17,7 +17,7 @@ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "dsp/tty/quant.h" -#include "libc/intrin/xmmintrin.internal.h" +#include "third_party/intel/xmmintrin.internal.h" struct TtyRgb rgb2ttyf2i_(__m128 rgb) { __v4si i4; diff --git a/libc/intrin/avx2intrin.internal.h b/libc/intrin/avx2intrin.internal.h deleted file mode 100644 index cffe833a1..000000000 --- a/libc/intrin/avx2intrin.internal.h +++ /dev/null @@ -1,133 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_BITS_AVX2INTRIN_H_ -#define COSMOPOLITAN_LIBC_BITS_AVX2INTRIN_H_ -#include "libc/intrin/avxintrin.internal.h" -#if !(__ASSEMBLER__ + __LINKER__ + 0) -COSMOPOLITAN_C_START_ - -#define _mm256_min_epi16(M256_0, M256_1) \ - ((__m256i)__builtin_ia32_minps((__v16hi)(M256_0), (__v16hi)(M256_1))) - -/*───────────────────────────────────────────────────────────────────────────│─╗ -│ cosmopolitan § it's a trap! 
» avx2 » simd ops ─╬─│┼ -╚────────────────────────────────────────────────────────────────────────────│*/ - -#define _mm256_add_ps(M256_0, M256_1) \ - ((__m256)((__v8sf)(M256_0) + (__v8sf)(M256_1))) -#define _mm256_sub_ps(M256_0, M256_1) \ - ((__m256)((__v8sf)(M256_0) - (__v8sf)(M256_1))) -#define _mm256_mul_ps(M256_0, M256_1) \ - ((__m256)((__v8sf)(M256_0) * (__v8sf)(M256_1))) -#define _mm256_div_ps(M256_0, M256_1) \ - ((__m256)((__v8sf)(M256_0) / (__v8sf)(M256_1))) -#define _mm256_and_ps(M256_0, M256_1) \ - ((__m256)((__v8su)(M256_0) & (__v8su)(M256_1))) -#define _mm256_or_ps(M256_0, M256_1) \ - ((__m256)((__v8su)(M256_0) | (__v8su)(M256_1))) -#define _mm256_xor_ps(M256_0, M256_1) /* XORPD [u32 simd xor] */ \ - ((__m256)((__v8su)(M256_0) ^ (__v8su)(M256_1))) -#define _mm256_andnot_ps(M256_0, M256_1) /* ANDNPS [u32 simd nand] */ \ - ((__m256)(~(__v8su)(M256_0) & (__v8su)(M256_1))) -#define _mm256_rcp_ps(M256) __builtin_ia32_rcpps256((__v8sf)(M256)) -#define _mm256_sqrt_ps(M256) __builtin_ia32_sqrtps256((__v8sf)(M256)) -#define _mm256_rsqrt_ps(M256) __builtin_ia32_rsqrtps256((__v8sf)(M256)) -#define _mm256_round_ps(M256, IMM) \ - ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(M256), IMM)) - -#define _mm256_add_epi32(M256I_0, M256I_1) \ - ((__m256i)((__v8su)(M256I_0) + (__v8su)(M256I_1))) -#define _mm256_cmpgt_epi32(M256I_0, M256I_1) \ - ((__m256i)((__v8si)(M256I_0) > (__v8si)(M256I_1))) -#define _mm256_min_epi32(M256I_0, M256I_1) \ - ((__m256i)__builtin_ia32_pminsd256((__v8si)(M256I_0), (__v8si)(M256I_1))) -#define _mm256_min_epu32(M256I_0, M256I_1) \ - ((__m256i)__builtin_ia32_pminud256((__v8si)(M256I_0), (__v8si)(M256I_1))) -#define _mm256_max_epi32(M256I_0, M256I_1) \ - ((__m256i)__builtin_ia32_pmaxsd256((__v8si)(M256I_0), (__v8si)(M256I_1))) -#define _mm256_max_epu32(M256I_0, M256I_1) \ - ((__m256i)__builtin_ia32_pmaxud256((__v8si)(M256I_0), (__v8si)(M256I_1))) -#define _mm256_blendv_epi8(M256I_0, M256I_1, M256I_2) \ - ((__m256i)__builtin_ia32_pblendvb256((__v32qi)(M256I_0), (__v32qi)(M256I_1), \ - (__v32qi)(M256I_2))) - -#define _mm256_min_ps(M256_0, M256_1) \ - ((__m256)__builtin_ia32_minps256((__v8sf)(__m256)(M256_0), \ - (__v8sf)(__m256)(M256_1))) -#define _mm256_max_ps(M256_0, M256_1) \ - ((__m256)__builtin_ia32_maxps256((__v8sf)(__m256)(M256_0), \ - (__v8sf)(__m256)(M256_1))) -#define _mm256_cmpneq_ps(M256_0, M256_1) \ - ((__m256)__builtin_ia32_cmpneqps((__v8sf)(__m256)(M256_0), \ - (__v8sf)(__m256)(M256_1))) -#define _mm256_cmplt_ps(M256_0, M256_1) \ - ((__m256)__builtin_ia32_cmpltps((__v8sf)(__m256)(M256_0), \ - (__v8sf)(__m256)(M256_1))) -#define _mm256_cmpnlt_ps(M256_0, M256_1) \ - ((__m256)__builtin_ia32_cmpnltps((__v8sf)(__m256)(M256_0), \ - (__v8sf)(__m256)(M256_1))) -#define _mm256_cmple_ps(M256_0, M256_1) \ - ((__m256)__builtin_ia32_cmpleps((__v8sf)(__m256)(M256_0), \ - (__v8sf)(__m256)(M256_1))) -#define _mm256_cmpnle_ps(M256_0, M256_1) \ - ((__m256)__builtin_ia32_cmpnleps((__v8sf)(__m256)(M256_0), \ - (__v8sf)(__m256)(M256_1))) -#define _mm256_cmpgt_ps(M256_0, M256_1) \ - ((__m256)__builtin_ia32_cmpltps((__v8sf)(__m256)(M256_1), \ - (__v8sf)(__m256)(M256_0))) -#define _mm256_cmpngt_ps(M256_0, M256_1) \ - ((__m256)__builtin_ia32_cmpnltps((__v8sf)(__m256)(M256_1), \ - (__v8sf)(__m256)(M256_0))) -#define _mm256_cmpge_ps(M256_0, M256_1) \ - ((__m256)__builtin_ia32_cmpleps((__v8sf)(__m256)(M256_1), \ - (__v8sf)(__m256)(M256_0))) -#define _mm256_cmpnge_ps(M256_0, M256_1) \ - ((__m256)__builtin_ia32_cmpnleps((__v8sf)(__m256)(M256_1), \ - (__v8sf)(__m256)(M256_0))) 
-#define _mm256_cmpord_ps(M256_0, M256_1) \ - ((__m256)__builtin_ia32_cmpordps((__v8sf)(__m256)(M256_0), \ - (__v8sf)(__m256)(M256_1))) -#define _mm256_cmpunord_ps(M256_0, M256_1) \ - ((__m256)__builtin_ia32_cmpunordps((__v8sf)(__m256)(M256_0), \ - (__v8sf)(__m256)(M256_1))) - -/*───────────────────────────────────────────────────────────────────────────│─╗ -│ cosmopolitan § avx2 » memory ops ─╬─│┼ -╚────────────────────────────────────────────────────────────────────────────│*/ - -struct thatispacked PackedMayaliasIntyYmm { - __m256i Ymm; -} mayalias; - -#define _mm256_set_ps(FLT_0, FLT_1, FLT_2, FLT_3, FLT_4, FLT_5, FLT_6, FLT_7) \ - ((__m256)(__v8sf){(float)(FLT_0), (float)(FLT_1), (float)(FLT_2), \ - (float)(FLT_3), (float)(FLT_4), (float)(FLT_5), \ - (float)(FLT_6), (float)(FLT_7)}) -#define _mm256_set1_ps(FLT_0) \ - _mm256_set_ps(FLT_0, FLT_0, FLT_0, FLT_0, FLT_0, FLT_0, FLT_0, FLT_0) -#define _mm256_setr_ps(FLT_0, FLT_1, FLT_2, FLT_3, FLT_4, FLT_5, FLT_6, FLT_7) \ - _mm256_set_ps(FLT_7, FLT_6, FLT_5, FLT_4, FLT_3, FLT_2, FLT_1, FLT_0) - -#define _mm256_set_epi32(INT_0, INT_1, INT_2, INT_3, INT_4, INT_5, INT_6, \ - INT_7) \ - ((__m256i)(__v8si){(int)(INT_0), (int)(INT_1), (int)(INT_2), (int)(INT_3), \ - (int)(INT_4), (int)(INT_5), (int)(INT_6), (int)(INT_7)}) -#define _mm256_set1_epi32(INT_0) \ - _mm256_set_epi32(INT_0, INT_0, INT_0, INT_0, INT_0, INT_0, INT_0, INT_0) -#define _mm256_setr_epi32(INT_0, INT_1, INT_2, INT_3, INT_4, INT_5, INT_6, \ - INT_7) \ - _mm256_set_epi32(INT_7, INT_6, INT_5, INT_4, INT_3, INT_2, INT_1, INT_0) - -#define _mm256_loadu_si256(M256IP_0) \ - ({ \ - const __m256i *Ymm = (M256IP_0); \ - ((struct PackedMayaliasIntyYmm *)Ymm)->Ymm; \ - }) - -#define _mm256_storeu_si256(M256IP_0, M256I_1) \ - ({ \ - __m256i *Ymm = (M256IP_0); \ - ((struct PackedMayaliasIntyYmm *)Ymm)->Ymm = M256I_1; \ - }) - -COSMOPOLITAN_C_END_ -#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ -#endif /* COSMOPOLITAN_LIBC_BITS_AVX2INTRIN_H_ */ diff --git a/libc/intrin/avxintrin.internal.h b/libc/intrin/avxintrin.internal.h deleted file mode 100644 index 4bca02207..000000000 --- a/libc/intrin/avxintrin.internal.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_BITS_AVXINTRIN_H_ -#define COSMOPOLITAN_LIBC_BITS_AVXINTRIN_H_ -#if !(__ASSEMBLER__ + __LINKER__ + 0) -COSMOPOLITAN_C_START_ - -typedef float __m256 _Vector_size(32) mayalias; -typedef double __m256d _Vector_size(32) mayalias; -typedef long long __m256i _Vector_size(32) mayalias; - -typedef float __m256_u _Vector_size(32) forcealign(1) mayalias; -typedef double __m256d_u _Vector_size(32) forcealign(1) mayalias; -typedef long long __m256i_u _Vector_size(32) forcealign(1) mayalias; - -typedef double __v4df _Vector_size(32); -typedef float __v8sf _Vector_size(32); -typedef long long __v4di _Vector_size(32); -typedef unsigned long long __v4du _Vector_size(32); -typedef int __v8si _Vector_size(32); -typedef unsigned __v8su _Vector_size(32); -typedef short __v16hi _Vector_size(32); -typedef unsigned short __v16hu _Vector_size(32); -typedef char __v32qi _Vector_size(32); -typedef unsigned char __v32qu _Vector_size(32); - -#define _mm256_setzero_ps() ((__m256)(__v8sf){0}) -#define _mm256_load_ps(FLOATPTR) (*(__m256 *)(FLOATPTR)) -#define _mm256_loadu_ps(FLOATPTR) (*(__m256_u *)(FLOATPTR)) -#define _mm256_store_ps(FLOATPTR, M256_0) \ - (*(__m256 *)(FLOATPTR) = (__m256)(M256_0)) -#define _mm256_storeu_ps(FLOATPTR, M256_0) \ - (*(__m256_u *)(FLOATPTR) = (__m256)(M256_0)) -#define _mm256_extractf128_ps(M256_0, INT_1) \ - 
((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(M256_0), \ - (int)(INT_1))) -#define _mm256_insertf128_ps(M256_0, M128_1, IMM_2) \ - ((__m256)__builtin_ia32_vinsertf128_ps256( \ - (__v8sf)(__m256)(M256_0), (__v4sf)(__m128)(M128_1), (int)(IMM_2))) - -#ifdef __llvm__ -#define _mm256_castps128_ps256(M128_0) \ - ((__m256)__builtin_shufflevector((__v4sf)(__m128)(M128_0), \ - (__v4sf)(__m128)(M128_0), 0, 1, 2, 3, -1, \ - -1, -1, -1)) -#else -#define _mm256_castps128_ps256(M128_0) \ - ((__m256)__builtin_ia32_ps256_ps((__v4sf)(__m128)(M128_0))) -#endif - -COSMOPOLITAN_C_END_ -#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ -#endif /* COSMOPOLITAN_LIBC_BITS_AVXINTRIN_H_ */ diff --git a/libc/intrin/emmintrin.internal.h b/libc/intrin/emmintrin.internal.h deleted file mode 100644 index 75e25b045..000000000 --- a/libc/intrin/emmintrin.internal.h +++ /dev/null @@ -1,244 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_BITS_EMMINTRIN_H_ -#define COSMOPOLITAN_LIBC_BITS_EMMINTRIN_H_ -#include "libc/intrin/xmmintrin.internal.h" -#if !(__ASSEMBLER__ + __LINKER__ + 0) - -/*───────────────────────────────────────────────────────────────────────────│─╗ -│ cosmopolitan § it's a trap! » sse2 ─╬─│┼ -╚────────────────────────────────────────────────────────────────────────────│*/ - -typedef char __v16qi _Vector_size(16); -typedef unsigned char __v16qu _Vector_size(16); -typedef signed char __v16qs _Vector_size(16); - -typedef short __v8hi _Vector_size(16); -typedef unsigned short __v8hu _Vector_size(16); - -typedef double __v2df _Vector_size(16); -typedef double __m128d _Vector_size(16) forcealign(16); -typedef double __m128d_u _Vector_size(16) forcealign(1); - -typedef long long __v2di _Vector_size(16); -typedef long long __m128i _Vector_size(16) forcealign(16); -typedef long long __m128i_u _Vector_size(16) forcealign(1); -typedef unsigned long long __v2du _Vector_size(16); - -struct thatispacked mayalias __usi128ma { - __m128i_u __v; -}; - -/*───────────────────────────────────────────────────────────────────────────│─╗ -│ cosmopolitan § it's a trap! 
» sse2 » memory ops ─╬─│┼ -╚────────────────────────────────────────────────────────────────────────────│*/ - -#define _mm_loadu_si128(M128IP) ((struct __usi128ma *)(M128IP))->__v -#define _mm_storeu_si128(M128IP, M128I) \ - (((struct __usi128ma *)(M128IP))->__v = (M128I)) - -#define _mm_set_epi8(I8_15, I8_14, I8_13, I8_12, I8_11, I8_10, I8_9, I8_8, \ - I8_7, I8_6, I8_5, I8_4, I8_3, I8_2, I8_1, I8_0) \ - ((__m128i)(__v16qi){I8_0, I8_1, I8_2, I8_3, I8_4, I8_5, I8_6, I8_7, I8_8, \ - I8_9, I8_10, I8_11, I8_12, I8_13, I8_14, I8_15}) -#define _mm_set_epi16(I16_7, I16_6, I16_5, I16_4, I16_3, I16_2, I16_1, I16_0) \ - ((__m128i)(__v8hi){I16_0, I16_1, I16_2, I16_3, I16_4, I16_5, I16_6, I16_7}) -#define _mm_set_epi32(I32_3, I32_2, I32_1, I32_0) \ - ((__m128i)(__v4si){I32_0, I32_1, I32_2, I32_3}) -#define _mm_set_epi64x(I64_1, I64_0) ((__m128i)(__v2di){I64_0, I64_1}) - -#define _mm_setr_epi8(I8_15, I8_14, I8_13, I8_12, I8_11, I8_10, I8_9, I8_8, \ - I8_7, I8_6, I8_5, I8_4, I8_3, I8_2, I8_1, I8_0) \ - _mm_set_epi8(I8_0, I8_1, I8_2, I8_3, I8_4, I8_5, I8_6, I8_7, I8_8, I8_9, \ - I8_10, I8_11, I8_12, I8_13, I8_14, I8_15) -#define _mm_setr_epi16(I16_7, I16_6, I16_5, I16_4, I16_3, I16_2, I16_1, I16_0) \ - _mm_set_epi16(I16_0, I16_1, I16_2, I16_3, I16_4, I16_5, I16_6, I16_7) -#define _mm_setr_epi32(I32_3, I32_2, I32_1, I32_0) \ - _mm_set_epi32(I32_0, I32_1, I32_2, I32_3) -#define _mm_setr_epi64x(I64_1, I64_0) _mm_set_epi64x(I64_0, I64_1) - -#define _mm_set1_epi8(I8) \ - _mm_set_epi8(I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8) -#define _mm_set1_epi16(I16) \ - _mm_set_epi16(I16, I16, I16, I16, I16, I16, I16, I16) -#define _mm_set1_epi32(I32) _mm_set_epi32(I32, I32, I32, I32) -#define _mm_set1_epi64x(I64) _mm_set_epi64x(I64, I64) - -#define _mm_cvtsi128_si32(M128I) ((__v4si)(M128I))[0] -#define _mm_cvtsi32_si128(I32) ((__m128i)(__v4si){(I32), 0, 0, 0}) -#define _mm_setzero_si128() ((__m128i)(__v2di){0LL, 0LL}) -#define _mm_castsi128_ps(M128I) ((__m128)(M128I)) -#define _mm_castps_si128(M128) ((__m128i)(M128)) -#define _mm_load_si128(M128I) (*(M128I)) - -/*───────────────────────────────────────────────────────────────────────────│─╗ -│ cosmopolitan § it's a trap! 
» sse2 » simd ops ─╬─│┼ -╚────────────────────────────────────────────────────────────────────────────│*/ - -#define _mm_and_si128(M128I_0, M128I_1) \ - ((__m128i)((__v2du)(M128I_0) & (__v2du)(M128I_1))) -#define _mm_or_si128(M128I_0, M128I_1) \ - ((__m128i)((__v2du)(M128I_0) | (__v2du)(M128I_1))) -#define _mm_xor_si128(M128I_0, M128I_1) \ - ((__m128i)((__v2du)(M128I_0) ^ (__v2du)(M128I_1))) -#define _mm_andnot_si128(M128I_0, M128I_1) \ - ((__m128i)(~(__v2du)(M128I_0) & (__v2du)(M128I_1))) - -#define _mm_add_pd(M128D_0, M128D_1) \ - (__m128d)((__v2df)(M128D_0) + (__v2df)(M128D_1)) -#define _mm_sub_pd(M128D_0, M128D_1) \ - (__m128d)((__v2df)(M128D_0) - (__v2df)(M128D_1)) -#define _mm_mul_pd(M128D_0, M128D_1) \ - (__m128d)((__v2df)(M128D_0) * (__v2df)(M128D_1)) -#define _mm_div_pd(M128D_0, M128D_1) \ - (__m128d)((__v2df)(M128D_0) / (__v2df)(M128D_1)) -#define _mm_and_pd(M128D_0, M128D_1) \ - (__m128d)((__v2df)(M128D_0) & (__v2df)(M128D_1)) -#define _mm_or_pd(M128D_0, M128D_1) \ - (__m128d)((__v2df)(M128D_0) | (__v2df)(M128D_1)) -#define _mm_xor_pd(M128D_0, M128D_1) \ - (__m128d)((__v2df)(M128D_0) ^ (__v2df)(M128D_1)) -#define _mm_andnot_pd(M128D_0, M128D_1) \ - (__m128d)(~(__v2df)(M128D_0) & (__v2df)(M128D_1)) -#define _mm_sqrt_pd(M128D) __builtin_ia32_sqrtpd((__v2df)(M128D)) - -#define _mm_min_pd(M128D_0, M128D_1) \ - __builtin_ia32_minpd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_max_pd(M128D_0, M128D_1) \ - __builtin_ia32_maxpd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmpeq_pd(M128D_0, M128D_1) \ - __builtin_ia32_cmpeqpd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmpneq_pd(M128D_0, M128D_1) \ - __builtin_ia32_cmpneqpd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmplt_pd(M128D_0, M128D_1) \ - __builtin_ia32_cmpltpd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmpnlt_pd(M128D_0, M128D_1) \ - __builtin_ia32_cmpnltpd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmple_pd(M128D_0, M128D_1) \ - __builtin_ia32_cmplepd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmpnle_pd(M128D_0, M128D_1) \ - __builtin_ia32_cmpnlepd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmpgt_pd(M128D_0, M128D_1) \ - __builtin_ia32_cmpltpd((__v2df)(M128D_1), (__v2df)(M128D_0)) -#define _mm_cmpngt_pd(M128D_0, M128D_1) \ - __builtin_ia32_cmpnltpd((__v2df)(M128D_1), (__v2df)(M128D_0)) -#define _mm_cmpge_pd(M128D_0, M128D_1) \ - __builtin_ia32_cmplepd((__v2df)(M128D_1), (__v2df)(M128D_0)) -#define _mm_cmpnge_pd(M128D_0, M128D_1) \ - __builtin_ia32_cmpnlepd((__v2df)(M128D_1), (__v2df)(M128D_0)) -#define _mm_cmpord_pd(M128D_0, M128D_1) \ - __builtin_ia32_cmpordpd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmpunord_pd(M128D_0, M128D_1) \ - __builtin_ia32_cmpunordpd((__v2df)(M128D_0), (__v2df)(M128D_1)) - -#define _mm_sad_epu8(M128I_0, M128I_1) \ - __builtin_ia32_psadbw128((__v16qi)(M128I_0), (__v16qi)(M128I_1)) - -#define _mm_subs_epi8(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_psubsb128((__v16qi)(M128I_0), (__v16qi)(M128I_1))) -#define _mm_subs_epu8(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_psubusw128((__v16qi)(M128I_0), (__v16qi)(M128I_1))) -#define _mm_subs_epi16(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_psubsw128((__v8hi)(M128I_0), (__v8hi)(M128I_1))) -#define _mm_subs_epu16(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_psubusw128((__v8hi)(M128I_0), (__v8hi)(M128I_1))) - -#define _mm_add_epi32(M128I_0, M128I_1) \ - ((__m128i)((__v4su)(M128I_0) + (__v4su)(M128I_1))) -#define _mm_sub_epi32(M128I_0, M128I_1) \ - ((__m128i)((__v4su)(M128I_0) - 
(__v4su)(M128I_1))) -#define _mm_madd_epi16(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_pmaddwd128((__v8hi)(M128I_0), (__v8hi)(M128I_1))) -#define _mm_shuffle_epi32(V, IMM) \ - ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(V), (int)(IMM))) - -#define _mm_slli_epi32(M128I, COUNT) \ - ((__m128i)__builtin_ia32_pslldi128((__v4si)(M128I), (COUNT))) - -#define _mm_slli_si128(M128I, IMM) \ - ((__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(M128I), (int)(IMM)*8)) -#define _mm_srli_si128(M128I, IMM) \ - ((__m128i)__builtin_ia32_psrldqi128((__v2di)(__m128i)(M128I), (int)(IMM)*8)) - -#define _mm_cmpeq_epi8(a, b) ((__m128i)((__v16qi)(a) == (__v16qi)(b))) -#define _mm_movemask_epi8(a) __builtin_ia32_pmovmskb128((__v16qi)(a)) - -/*───────────────────────────────────────────────────────────────────────────│─╗ -│ cosmopolitan § it's a trap! » sse2 » scalar ops ─╬─│┼ -╚────────────────────────────────────────────────────────────────────────────│*/ - -#define _mm_sqrt_sd(M128D_0, M128D_1) \ - ({ \ - __m128d M128d2 = __builtin_ia32_sqrtsd((__v2df)(M128D_1)); \ - (__m128d){M128d2[0], (M128D_0)[1]}; \ - }) - -#define _mm_add_sd(M128D_0, M128D_1) \ - ({ \ - (M128D_0)[0] += (M128D_1)[0]; \ - (M128D_0); \ - }) -#define _mm_sub_sd(M128D_0, M128D_1) \ - ({ \ - (M128D_0)[0] -= (M128D_1)[0]; \ - (M128D_0); \ - }) -#define _mm_mul_sd(M128D_0, M128D_1) \ - ({ \ - (M128D_0)[0] *= (M128D_1)[0]; \ - (M128D_0); \ - }) -#define _mm_div_sd(M128D_0, M128D_1) \ - ({ \ - (M128D_0)[0] /= (M128D_1)[0]; \ - (M128D_0); \ - }) - -#define _mm_min_sd(M128D_0, M128D_1) \ - __builtin_ia32_minsd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_max_sd(M128D_0, M128D_1) \ - __builtin_ia32_maxsd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmpeq_sd(M128D_0, M128D_1) \ - __builtin_ia32_cmpeqsd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmpneq_sd(M128D_0, M128D_1) \ - __builtin_ia32_cmpneqsd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmplt_sd(M128D_0, M128D_1) \ - __builtin_ia32_cmpltsd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmpnlt_sd(M128D_0, M128D_1) \ - __builtin_ia32_cmpnltsd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmple_sd(M128D_0, M128D_1) \ - __builtin_ia32_cmplesd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmpnle_sd(M128D_0, M128D_1) \ - __builtin_ia32_cmpnlesd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmpgt_sd(M128D_0, M128D_1) \ - __builtin_ia32_cmpltsd((__v2df)(M128D_1), (__v2df)(M128D_0)) -#define _mm_cmpngt_sd(M128D_0, M128D_1) \ - __builtin_ia32_cmpnltsd((__v2df)(M128D_1), (__v2df)(M128D_0)) -#define _mm_cmpge_sd(M128D_0, M128D_1) \ - __builtin_ia32_cmplesd((__v2df)(M128D_1), (__v2df)(M128D_0)) -#define _mm_cmpnge_sd(M128D_0, M128D_1) \ - __builtin_ia32_cmpnlesd((__v2df)(M128D_1), (__v2df)(M128D_0)) -#define _mm_cmpord_sd(M128D_0, M128D_1) \ - __builtin_ia32_cmpordsd((__v2df)(M128D_0), (__v2df)(M128D_1)) -#define _mm_cmpunord_sd(M128D_0, M128D_1) \ - __builtin_ia32_cmpunordsd((__v2df)(M128D_0), (__v2df)(M128D_1)) - -#define _mm_SSE2(op, A, B) \ - ({ \ - __m128i R = A; \ - asm(#op " %1, %0" : "+x"(R) : "xm"(B)); \ - R; \ - }) -#define _mm_mul_epu32(A, B) _mm_SSE2(pmuludq, A, B) -#define _mm_add_epi64(A, B) _mm_SSE2(paddq, A, B) -#define _mm_srli_epi64(A, B) _mm_SSE2(psrlq, A, B) -#define _mm_slli_epi64(A, B) _mm_SSE2(psllq, A, B) -#define _mm_unpacklo_epi64(A, B) _mm_SSE2(punpcklqdq, A, B) -#define _mm_unpackhi_epi64(A, B) _mm_SSE2(punpckhqdq, A, B) - -/*───────────────────────────────────────────────────────────────────────────│─╗ -│ cosmopolitan § it's 
a trap! » sse2 » miscellaneous ─╬─│┼ -╚────────────────────────────────────────────────────────────────────────────│*/ - -#define _mm_pause() asm("rep nop") - -#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ -#endif /* COSMOPOLITAN_LIBC_BITS_EMMINTRIN_H_ */ diff --git a/libc/intrin/pmmintrin.internal.h b/libc/intrin/pmmintrin.internal.h deleted file mode 100644 index 715bb92ae..000000000 --- a/libc/intrin/pmmintrin.internal.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_BITS_PMMINTRIN_H_ -#define COSMOPOLITAN_LIBC_BITS_PMMINTRIN_H_ -#if !(__ASSEMBLER__ + __LINKER__ + 0) - -/*───────────────────────────────────────────────────────────────────────────│─╗ -│ cosmopolitan § it's a trap! » sse3 ─╬─│┼ -╚────────────────────────────────────────────────────────────────────────────│*/ - -#define _mm_hadd_ps(M128_0, M128_1) \ - ((__m128)__builtin_ia32_haddps((__v4sf)(__m128)(M128_0), \ - (__v4sf)(__m128)(M128_0))) - -#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ -#endif /* COSMOPOLITAN_LIBC_BITS_PMMINTRIN_H_ */ diff --git a/libc/intrin/shaintrin.internal.h b/libc/intrin/shaintrin.internal.h deleted file mode 100644 index c2e02117a..000000000 --- a/libc/intrin/shaintrin.internal.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_BITS_SHAINTRIN_H_ -#define COSMOPOLITAN_LIBC_BITS_SHAINTRIN_H_ -#include "libc/intrin/emmintrin.internal.h" -#include "libc/intrin/xmmintrin.internal.h" -#if !(__ASSEMBLER__ + __LINKER__ + 0) - -#define _mm_sha1rnds4_epu32(M128I_0, M128I_1, MEM) \ - __builtin_ia32_sha1rnds4((__v4si)(__m128i)(M128I_0), \ - (__v4si)(__m128i)(M128I_1), (MEM)) - -#define _mm_sha1nexte_epu32(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_sha1nexte((__v4si)(__m128i)(M128I_0), \ - (__v4si)(__m128i)(M128I_1))) - -#define _mm_sha1msg1_epu32(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_sha1msg1((__v4si)(__m128i)(M128I_0), \ - (__v4si)(__m128i)(M128I_1))) - -#define _mm_sha1msg2_epu32(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_sha1msg2((__v4si)(__m128i)(M128I_0), \ - (__v4si)(__m128i)(M128I_1))) - -#define _mm_sha256rnds2_epu32(M128I_0, M128I_1, M128I_2) \ - ((__m128i)__builtin_ia32_sha256rnds2((__v4si)(__m128i)(M128I_0), \ - (__v4si)(__m128i)(M128I_1), \ - (__v4si)(__m128i)(M128I_2))) - -#define _mm_sha256msg1_epu32(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_sha256msg1((__v4si)(__m128i)(M128I_0), \ - (__v4si)(__m128i)(M128I_1))) - -#define _mm_sha256msg2_epu32(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_sha256msg2((__v4si)(__m128i)(M128I_0), \ - (__v4si)(__m128i)(M128I_1))) - -#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ -#endif /* COSMOPOLITAN_LIBC_BITS_SHAINTRIN_H_ */ diff --git a/libc/intrin/smmintrin.internal.h b/libc/intrin/smmintrin.internal.h deleted file mode 100644 index 2d4701351..000000000 --- a/libc/intrin/smmintrin.internal.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_BITS_SMMINTRIN_H_ -#define COSMOPOLITAN_LIBC_BITS_SMMINTRIN_H_ - -/** - * @fileoverview SSE4 intrinsics. 
- */ - -#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_CUR_DIRECTION 4 -#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) -#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_NO_EXC 8 -#define _MM_FROUND_RAISE_EXC 0 -#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_TO_NEAREST_INT 0 -#define _MM_FROUND_TO_NEG_INF 1 -#define _MM_FROUND_TO_POS_INF 2 -#define _MM_FROUND_TO_ZERO 3 -#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) - -#if !(__ASSEMBLER__ + __LINKER__ + 0) - -#define _mm_extract_epi32(M128I, I32) \ - ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(M128I), (int)(I32))) - -#define _mm_minpos_epu16(M128I) \ - ((int)__builtin_ia32_phminposuw128((__v4si)(__m128i)(M128I), (int)(I32))) - -#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ -#endif /* COSMOPOLITAN_LIBC_BITS_SMMINTRIN_H_ */ diff --git a/libc/intrin/tmmintrin.internal.h b/libc/intrin/tmmintrin.internal.h deleted file mode 100644 index 58e3ce657..000000000 --- a/libc/intrin/tmmintrin.internal.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_BITS_TMMINTRIN_H_ -#define COSMOPOLITAN_LIBC_BITS_TMMINTRIN_H_ -#include "libc/intrin/emmintrin.internal.h" -#if !(__ASSEMBLER__ + __LINKER__ + 0) - -/*───────────────────────────────────────────────────────────────────────────│─╗ -│ cosmopolitan § it's a trap! » ssse3 ─╬─│┼ -╚────────────────────────────────────────────────────────────────────────────│*/ - -#define _mm_maddubs_epi16(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_pmaddubsw128((__v16qi)(M128I_0), (__v16qi)(M128I_1))) - -#define _mm_shuffle_epi8(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_pshufb128((__v16qi)(M128I_0), (__v16qi)(M128I_1))) - -#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ -#endif /* COSMOPOLITAN_LIBC_BITS_TMMINTRIN_H_ */ diff --git a/libc/intrin/wmmintrin.internal.h b/libc/intrin/wmmintrin.internal.h deleted file mode 100644 index 544b6ef8d..000000000 --- a/libc/intrin/wmmintrin.internal.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_BITS_WMMINTRIN_H_ -#define COSMOPOLITAN_LIBC_BITS_WMMINTRIN_H_ -#include "libc/intrin/emmintrin.internal.h" -#if !(__ASSEMBLER__ + __LINKER__ + 0) - -#define _mm_clmulepi64_si128(X, Y, IMM) \ - ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \ - (__v2di)(__m128i)(Y), (char)(IMM))) - -#define _mm_aesenc_si128(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_aesenc128((__v2di)(M128I_0), (__v2di)(M128I_1))) -#define _mm_aesenclast_si128(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_aesenclast128((__v2di)(M128I_0), (__v2di)(M128I_1))) - -#define _mm_aesdec_si128(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_aesdec128((__v2di)(M128I_0), (__v2di)(M128I_1))) -#define _mm_aesdeclast_si128(M128I_0, M128I_1) \ - ((__m128i)__builtin_ia32_aesdeclast128((__v2di)(M128I_0), (__v2di)(M128I_1))) - -#define _mm_aesimc_si128(M128I) \ - ((__m128i)__builtin_ia32_aesimc128((__v2di)(M128I))) -#define _mm_aesimclast_si128(M128I) \ - ((__m128i)__builtin_ia32_aesimclast128((__v2di)(M128I))) - -#define _mm_aeskeygenassist_si128(X, Y) \ - ((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(X), (int)(Y))) - -#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ -#endif /* COSMOPOLITAN_LIBC_BITS_WMMINTRIN_H_ */ diff --git a/libc/intrin/xmmintrin.internal.h b/libc/intrin/xmmintrin.internal.h deleted file mode 100644 index 
27b46b617..000000000 --- a/libc/intrin/xmmintrin.internal.h +++ /dev/null @@ -1,243 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_BITS_XMMINTRIN_H_ -#define COSMOPOLITAN_LIBC_BITS_XMMINTRIN_H_ -#include "libc/dce.h" -#include "libc/intrin/emmintrin.internal.h" - -#define _MM_EXCEPT_MASK 0x003f -#define _MM_EXCEPT_INVALID 0x0001 -#define _MM_EXCEPT_DENORM 0x0002 -#define _MM_EXCEPT_DIV_ZERO 0x0004 -#define _MM_EXCEPT_OVERFLOW 0x0008 -#define _MM_EXCEPT_UNDERFLOW 0x0010 -#define _MM_EXCEPT_INEXACT 0x0020 -#define _MM_MASK_MASK 0x1f80 -#define _MM_MASK_INVALID 0x0080 -#define _MM_MASK_DENORM 0x0100 -#define _MM_MASK_DIV_ZERO 0x0200 -#define _MM_MASK_OVERFLOW 0x0400 -#define _MM_MASK_UNDERFLOW 0x0800 -#define _MM_MASK_INEXACT 0x1000 -#define _MM_ROUND_MASK 0x6000 -#define _MM_ROUND_NEAREST 0x0000 -#define _MM_ROUND_DOWN 0x2000 -#define _MM_ROUND_UP 0x4000 -#define _MM_ROUND_TOWARD_ZERO 0x6000 -#define _MM_FLUSH_ZERO_MASK 0x8000 -#define _MM_FLUSH_ZERO_ON 0x8000 -#define _MM_FLUSH_ZERO_OFF 0x0000 - -#define _MM_SHUFFLE(A, B, C, D) (((A) << 6) | ((B) << 4) | ((C) << 2) | (D)) - -#if !(__ASSEMBLER__ + __LINKER__ + 0) - -typedef int __v4si _Vector_size(16); -typedef unsigned int __v4su _Vector_size(16); -typedef float __v4sf _Vector_size(16); -typedef float __m128 _Vector_size(16) forcealign(16) mayalias; -typedef float __m128_u _Vector_size(16) forcealign(1) mayalias; - -/*───────────────────────────────────────────────────────────────────────────│─╗ -│ cosmopolitan § it's a trap! » sse » simd ops ─╬─│┼ -╚────────────────────────────────────────────────────────────────────────────│*/ - -#define _mm_add_ps(M128_0, M128_1) \ - ((__m128)((__v4sf)(M128_0) + (__v4sf)(M128_1))) -#define _mm_sub_ps(M128_0, M128_1) \ - ((__m128)((__v4sf)(M128_0) - (__v4sf)(M128_1))) -#define _mm_mul_ps(M128_0, M128_1) \ - ((__m128)((__v4sf)(M128_0) * (__v4sf)(M128_1))) -#define _mm_div_ps(M128_0, M128_1) \ - ((__m128)((__v4sf)(M128_0) / (__v4sf)(M128_1))) -#define _mm_and_ps(M128_0, M128_1) \ - ((__m128)((__v4su)(M128_0) & (__v4su)(M128_1))) -#define _mm_or_ps(M128_0, M128_1) \ - ((__m128)((__v4su)(M128_0) | (__v4su)(M128_1))) -#define _mm_xor_ps(M128_0, M128_1) /* XORPD [u32 simd xor] */ \ - ((__m128)((__v4su)(M128_0) ^ (__v4su)(M128_1))) -#define _mm_andnot_ps(M128_0, M128_1) /* ANDNPS [u32 simd nand] */ \ - ((__m128)(~(__v4su)(M128_0) & (__v4su)(M128_1))) -#define _mm_rcp_ps(M128) __builtin_ia32_rcpps((__v4sf)(M128)) -#define _mm_sqrt_ps(M128) __builtin_ia32_sqrtps((__v4sf)(M128)) -#define _mm_rsqrt_ps(M128) __builtin_ia32_rsqrtps((__v4sf)(M128)) - -#define _mm_min_ps(M128_0, M128_1) \ - __builtin_ia32_minps((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_max_ps(M128_0, M128_1) \ - __builtin_ia32_maxps((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_min_ss(M128_0, M128_1) \ - __builtin_ia32_minss((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_max_ss(M128_0, M128_1) \ - __builtin_ia32_maxss((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_cmpeq_ps(M128_0, M128_1) \ - __builtin_ia32_cmpeqps((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_cmpneq_ps(M128_0, M128_1) \ - __builtin_ia32_cmpneqps((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_cmplt_ps(M128_0, M128_1) \ - __builtin_ia32_cmpltps((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_cmpnlt_ps(M128_0, M128_1) \ - __builtin_ia32_cmpnltps((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_cmple_ps(M128_0, M128_1) \ - __builtin_ia32_cmpleps((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_cmpnle_ps(M128_0, M128_1) \ - __builtin_ia32_cmpnleps((__v4sf)(M128_0), (__v4sf)(M128_1)) 
-#define _mm_cmpgt_ps(M128_0, M128_1) \ - __builtin_ia32_cmpltps((__v4sf)(M128_1), (__v4sf)(M128_0)) -#define _mm_cmpngt_ps(M128_0, M128_1) \ - __builtin_ia32_cmpnltps((__v4sf)(M128_1), (__v4sf)(M128_0)) -#define _mm_cmpge_ps(M128_0, M128_1) \ - __builtin_ia32_cmpleps((__v4sf)(M128_1), (__v4sf)(M128_0)) -#define _mm_cmpnge_ps(M128_0, M128_1) \ - __builtin_ia32_cmpnleps((__v4sf)(M128_1), (__v4sf)(M128_0)) -#define _mm_cmpord_ps(M128_0, M128_1) \ - __builtin_ia32_cmpordps((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_cmpunord_ps(M128_0, M128_1) \ - __builtin_ia32_cmpunordps((__v4sf)(M128_0), (__v4sf)(M128_1)) - -/*───────────────────────────────────────────────────────────────────────────│─╗ -│ cosmopolitan § it's a trap! » sse » scalar ops ─╬─│┼ -╚────────────────────────────────────────────────────────────────────────────│*/ - -#define _mm_add_ss(m128_0, m128_1) \ - ({ \ - __m128 a = m128_0; \ - __m128 b = m128_1; \ - a[0] += b[0]; \ - a; \ - }) - -#define _mm_sub_ss(m128_0, m128_1) \ - ({ \ - __m128 a = m128_0; \ - __m128 b = m128_1; \ - a[0] -= b[0]; \ - a; \ - }) - -#define _mm_mul_ss(m128_0, m128_1) \ - ({ \ - __m128 a = m128_0; \ - __m128 b = m128_1; \ - a[0] *= b[0]; \ - a; \ - }) - -#define _mm_div_ss(m128_0, m128_1) \ - ({ \ - __m128 a = m128_0; \ - __m128 b = m128_1; \ - a[0] /= b[0]; \ - a; \ - }) - -#define _mm_rcp_ss(M128) __builtin_ia32_rcpss((__v4sf)(M128)) /*~1/x*/ -#define _mm_sqrt_ss(M128) __builtin_ia32_sqrtss((__v4sf)(M128)) /*sqrt𝑥*/ -#define _mm_rsqrt_ss(M128) __builtin_ia32_rsqrtss((__v4sf)(M128)) /*~1/sqrt𝑥*/ - -#define _mm_min_ss(M128_0, M128_1) \ - __builtin_ia32_minss((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_max_ss(M128_0, M128_1) \ - __builtin_ia32_maxss((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_cmpeq_ss(M128_0, M128_1) \ - __builtin_ia32_cmpeqss((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_cmpneq_ss(M128_0, M128_1) \ - __builtin_ia32_cmpneqss((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_cmplt_ss(M128_0, M128_1) \ - __builtin_ia32_cmpltss((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_cmpnlt_ss(M128_0, M128_1) \ - __builtin_ia32_cmpnltss((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_cmple_ss(M128_0, M128_1) \ - __builtin_ia32_cmpless((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_cmpnle_ss(M128_0, M128_1) \ - __builtin_ia32_cmpnless((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_cmpgt_ss(M128_0, M128_1) \ - __builtin_ia32_cmpltss((__v4sf)(M128_1), (__v4sf)(M128_0)) -#define _mm_cmpngt_ss(M128_0, M128_1) \ - __builtin_ia32_cmpnltss((__v4sf)(M128_1), (__v4sf)(M128_0)) -#define _mm_cmpge_ss(M128_0, M128_1) \ - __builtin_ia32_cmpless((__v4sf)(M128_1), (__v4sf)(M128_0)) -#define _mm_cmpnge_ss(M128_0, M128_1) \ - __builtin_ia32_cmpnless((__v4sf)(M128_1), (__v4sf)(M128_0)) -#define _mm_cmpord_ss(M128_0, M128_1) \ - __builtin_ia32_cmpordss((__v4sf)(M128_0), (__v4sf)(M128_1)) -#define _mm_cmpunord_ss(M128_0, M128_1) \ - __builtin_ia32_cmpunordss((__v4sf)(M128_0), (__v4sf)(M128_1)) - -/*───────────────────────────────────────────────────────────────────────────│─╗ -│ cosmopolitan § it's a trap! 
» sse » memory ops ─╬─│┼ -╚────────────────────────────────────────────────────────────────────────────│*/ - -#define _mm_set1_ps(M128_0) ((__m128)(__v4sf){M128_0, M128_0, M128_0, M128_0}) -#define _mm_setzero_ps() ((__m128)(__v4sf){0}) -#define _mm_cvtss_f32(M128_0) (((__v4sf)(M128_0))[0]) -#define _mm_load_ps(FLOATPTR) (*(__m128 *)(FLOATPTR)) -#define _mm_loadu_ps(FLOATPTR) (*(__m128_u *)(FLOATPTR)) -#define _mm_set_ps(WHO, DESIGNED, THIS, SHEESH) \ - ((__m128)(__v4sf){SHEESH, THIS, DESIGNED, WHO}) -#define _mm_set_ss(FLOAT) ((__m128)(__v4sf){FLOAT, 0, 0, 0}) -#define _mm_load_ss(FLOATPTR) _mm_set_ss(*(FLOATPTR)) -#define _mm_store_ss(FLOATPTR, M128_0) ((FLOATPTR)[0] = ((__v4sf)(M128_0))[0]) -#define _mm_store_ps(FLOATPTR, M128_0) (*(__m128 *)(FLOATPTR) = (M128_0)) -#define _mm_storeu_ps(FLOATPTR, M128_0) (*(__m128_u *)(FLOATPTR) = (M128_0)) -#define _mm_shuffle_ps(M128_0, M128_1, MASK) \ - ((__m128)__builtin_ia32_shufps((__v4sf)(M128_0), (__v4sf)(M128_1), (MASK))) - -#ifdef __llvm__ -#define _mm_movehl_ps(M128_0, M128_1) \ - ((__m128)__builtin_shufflevector((__v4sf)(__m128)(M128_0), \ - (__v4sf)(__m128)(M128_1), 6, 7, 2, 3)) -/* intrinsics unstable & constantly breaking, consider ansi c or asm. */ -/* each version of llvm has a different incompatible impl for this one */ -#else -#define _mm_movehl_ps(M128_0, M128_1) \ - ((__m128)__builtin_ia32_movhlps((__v4sf)(__m128)(M128_0), \ - (__v4sf)(__m128)(M128_1))) -#define _mm_storel_pi(M64PTR, M128_0) \ - __builtin_ia32_storelps((__v2sf *)(M64PTR), (__v4sf)(M128_0)) -#endif - -/*───────────────────────────────────────────────────────────────────────────│─╗ -│ cosmopolitan § it's a trap! » sse » cast ops ─╬─│┼ -╚────────────────────────────────────────────────────────────────────────────│*/ - -#define _mm_cvtps_epi32(M128_0) \ - ((__m128i)__builtin_ia32_cvtps2dq((__v4sf)(M128_0))) - -#ifdef __llvm__ -#define _mm_cvtepi32_ps(M128I_0) \ - ((__m128) __builtin_convertvector((__v4si)(__m128i)(M128I_0), __v4sf)) -#else -#define _mm_cvtepi32_ps(M128I_0) \ - ((__m128)__builtin_ia32_cvtdq2ps((__v4si)(M128I_0))) -#endif - -/*───────────────────────────────────────────────────────────────────────────│─╗ -│ cosmopolitan § it's a trap! 
» sse » misc ─╬─│┼ -╚────────────────────────────────────────────────────────────────────────────│*/ - -#define _mm_getcsr() (__builtin_ia32_stmxcsr()) -#define _mm_setcsr(U32CONF) (__builtin_ia32_ldmxcsr(U32CONF)) - -#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) -#define _MM_SET_ROUNDING_MODE(MODE) \ - (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (MODE))) - -#define XMM_DESTROY(VAR) \ - do { \ - if (!IsTrustworthy()) { \ - asm volatile("xorps\t%1,%0" : "=x"(VAR) : "0"(VAR)); \ - } \ - } while (0) - -/* -** Ternary: -** -** Integer: _mm_or_si128(_mm_and_si128(a, cond), _mm_andnot_si128(cond, b)) -** 32-bit float: _mm_or_ps(_mm_and_ps(a, cond), _mm_andnot_ps(cond, b)) -** 64-bit float: _mm_or_pd(_mm_and_pd(a, cond), _mm_andnot_pd(cond, b)) -** Integer (SSE4.1+): _mm_blendv_epi8(a, b, cond) -** 32-bit float (SSE4.1+): _mm_blendv_ps(a, b, cond) -** 64-bit float (SSE4.1+): _mm_blendv_pd(a, b, cond) -*/ - -#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ -#endif /* COSMOPOLITAN_LIBC_BITS_XMMINTRIN_H_ */ diff --git a/libc/isystem/ammintrin.h b/libc/isystem/ammintrin.h new file mode 100644 index 000000000..028098a89 --- /dev/null +++ b/libc/isystem/ammintrin.h @@ -0,0 +1,4 @@ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_ +#include "third_party/intel/ammintrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_ */ diff --git a/libc/isystem/avx2intrin.h b/libc/isystem/avx2intrin.h deleted file mode 100644 index 06f51d75d..000000000 --- a/libc/isystem/avx2intrin.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_ISYSTEM_AVX2INTRIN_H_ -#define COSMOPOLITAN_LIBC_ISYSTEM_AVX2INTRIN_H_ -#include "libc/intrin/avx2intrin.internal.h" -#endif /* COSMOPOLITAN_LIBC_ISYSTEM_AVX2INTRIN_H_ */ diff --git a/libc/isystem/avxintrin.h b/libc/isystem/avxintrin.h deleted file mode 100644 index 079159e22..000000000 --- a/libc/isystem/avxintrin.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_ISYSTEM_AVXINTRIN_H_ -#define COSMOPOLITAN_LIBC_ISYSTEM_AVXINTRIN_H_ -#include "libc/intrin/avxintrin.internal.h" -#endif /* COSMOPOLITAN_LIBC_ISYSTEM_AVXINTRIN_H_ */ diff --git a/libc/isystem/clzerointrin.h b/libc/isystem/clzerointrin.h new file mode 100644 index 000000000..5c0be5400 --- /dev/null +++ b/libc/isystem/clzerointrin.h @@ -0,0 +1,4 @@ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_ +#include "third_party/intel/clzerointrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_ */ diff --git a/libc/isystem/cpuid.h b/libc/isystem/cpuid.h new file mode 100644 index 000000000..89ac3c77c --- /dev/null +++ b/libc/isystem/cpuid.h @@ -0,0 +1,4 @@ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_CPUID_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_CPUID_INTERNAL_H_ +#include "third_party/intel/cpuid.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_CPUID_INTERNAL_H_ */ diff --git a/libc/isystem/emmintrin.h b/libc/isystem/emmintrin.h index 80f8c5332..1c670b16a 100644 --- a/libc/isystem/emmintrin.h +++ b/libc/isystem/emmintrin.h @@ -1,4 +1,4 @@ -#ifndef COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_H_ -#define
COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_H_ -#include "libc/intrin/emmintrin.internal.h" -#endif /* COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_H_ */ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_ +#include "third_party/intel/emmintrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_ */ diff --git a/libc/isystem/immintrin.h b/libc/isystem/immintrin.h new file mode 100644 index 000000000..683eb5a7a --- /dev/null +++ b/libc/isystem/immintrin.h @@ -0,0 +1,4 @@ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_ +#include "third_party/intel/immintrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_ */ diff --git a/libc/isystem/mm3dnow.h b/libc/isystem/mm3dnow.h new file mode 100644 index 000000000..01ac6b25f --- /dev/null +++ b/libc/isystem/mm3dnow.h @@ -0,0 +1,4 @@ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MM3DNOW_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_MM3DNOW_INTERNAL_H_ +#include "third_party/intel/mm3dnow.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MM3DNOW_INTERNAL_H_ */ diff --git a/libc/isystem/mm_malloc.h b/libc/isystem/mm_malloc.h new file mode 100644 index 000000000..7634fa6de --- /dev/null +++ b/libc/isystem/mm_malloc.h @@ -0,0 +1,4 @@ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_ +#include "third_party/intel/mm_malloc.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_ */ diff --git a/libc/isystem/mmintrin.h b/libc/isystem/mmintrin.h new file mode 100644 index 000000000..af089e7c6 --- /dev/null +++ b/libc/isystem/mmintrin.h @@ -0,0 +1,4 @@ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_ +#include "third_party/intel/mmintrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_ */ diff --git a/libc/isystem/mwaitxintrin.h b/libc/isystem/mwaitxintrin.h new file mode 100644 index 000000000..42a5f3e72 --- /dev/null +++ b/libc/isystem/mwaitxintrin.h @@ -0,0 +1,4 @@ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_ +#include "third_party/intel/mwaitxintrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_ */ diff --git a/libc/isystem/nmmintrin.h b/libc/isystem/nmmintrin.h new file mode 100644 index 000000000..0a5ef7c98 --- /dev/null +++ b/libc/isystem/nmmintrin.h @@ -0,0 +1,4 @@ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_ +#include "third_party/intel/nmmintrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_ */ diff --git a/libc/isystem/pmmintrin.h b/libc/isystem/pmmintrin.h index 087f0b39a..21e098b7c 100644 --- a/libc/isystem/pmmintrin.h +++ b/libc/isystem/pmmintrin.h @@ -1,4 +1,4 @@ -#ifndef COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_H_ -#define COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_H_ -#include "libc/intrin/pmmintrin.internal.h" -#endif /* COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_H_ */ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_ +#define
COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_ +#include "third_party/intel/pmmintrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_ */ diff --git a/libc/isystem/popcntintrin.h b/libc/isystem/popcntintrin.h new file mode 100644 index 000000000..632667eb0 --- /dev/null +++ b/libc/isystem/popcntintrin.h @@ -0,0 +1,4 @@ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_ +#include "third_party/intel/popcntintrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_ */ diff --git a/libc/isystem/sgxintrin.h b/libc/isystem/sgxintrin.h new file mode 100644 index 000000000..0ba872436 --- /dev/null +++ b/libc/isystem/sgxintrin.h @@ -0,0 +1,4 @@ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_ +#include "third_party/intel/sgxintrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_ */ diff --git a/libc/isystem/shaintrin.h b/libc/isystem/shaintrin.h deleted file mode 100644 index 308744668..000000000 --- a/libc/isystem/shaintrin.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SHAINTRIN_H_ -#define COSMOPOLITAN_LIBC_ISYSTEM_SHAINTRIN_H_ -#include "libc/intrin/shaintrin.internal.h" -#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SHAINTRIN_H_ */ diff --git a/libc/isystem/smmintrin.h b/libc/isystem/smmintrin.h index b226f1571..fd7d9b648 100644 --- a/libc/isystem/smmintrin.h +++ b/libc/isystem/smmintrin.h @@ -1,4 +1,4 @@ -#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_H_ -#define COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_H_ -#include "libc/intrin/smmintrin.internal.h" -#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_H_ */ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_ +#include "third_party/intel/smmintrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_ */ diff --git a/libc/isystem/tmmintrin.h b/libc/isystem/tmmintrin.h index eebddd130..d1279467e 100644 --- a/libc/isystem/tmmintrin.h +++ b/libc/isystem/tmmintrin.h @@ -1,4 +1,4 @@ -#ifndef COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_H_ -#define COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_H_ -#include "libc/intrin/tmmintrin.internal.h" -#endif /* COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_H_ */ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_ +#include "third_party/intel/tmmintrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_ */ diff --git a/libc/isystem/vaesintrin.h b/libc/isystem/vaesintrin.h new file mode 100644 index 000000000..59c3838a2 --- /dev/null +++ b/libc/isystem/vaesintrin.h @@ -0,0 +1,4 @@ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_VAESINTRIN_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_VAESINTRIN_INTERNAL_H_ +#include "third_party/intel/vaesintrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_VAESINTRIN_INTERNAL_H_ */ diff --git a/libc/isystem/wmmintrin.h b/libc/isystem/wmmintrin.h index ee565c59f..8c4f60e00 100644 --- a/libc/isystem/wmmintrin.h +++ b/libc/isystem/wmmintrin.h @@ -1,4 +1,4 @@ -#ifndef COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_H_ -#define COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_H_ -#include "libc/intrin/wmmintrin.internal.h" -#endif /* COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_H_ */ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_ +#include "third_party/intel/wmmintrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_ 
*/ diff --git a/libc/isystem/x86intrin.h b/libc/isystem/x86intrin.h new file mode 100644 index 000000000..fb8c3f971 --- /dev/null +++ b/libc/isystem/x86intrin.h @@ -0,0 +1,4 @@ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_ +#include "third_party/intel/x86intrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_ */ diff --git a/libc/isystem/xmmintrin.h b/libc/isystem/xmmintrin.h index 1aa83665b..594e650fd 100644 --- a/libc/isystem/xmmintrin.h +++ b/libc/isystem/xmmintrin.h @@ -1,4 +1,4 @@ -#ifndef COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_H_ -#define COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_H_ -#include "libc/intrin/xmmintrin.internal.h" -#endif /* COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_H_ */ +#ifndef COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_ +#define COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_ +#include "third_party/intel/xmmintrin.internal.h" +#endif /* COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_ */ diff --git a/third_party/intel/adxintrin.internal.h b/third_party/intel/adxintrin.internal.h new file mode 100644 index 000000000..fbfbbbc89 --- /dev/null +++ b/third_party/intel/adxintrin.internal.h @@ -0,0 +1,52 @@ +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <adxintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _ADXINTRIN_H_INCLUDED +#define _ADXINTRIN_H_INCLUDED + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _subborrow_u32(unsigned char __CF, unsigned int __X, unsigned int __Y, + unsigned int *__P) { + return __builtin_ia32_sbb_u32(__CF, __X, __Y, __P); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _addcarry_u32(unsigned char __CF, unsigned int __X, unsigned int __Y, + unsigned int *__P) { + return __builtin_ia32_addcarryx_u32(__CF, __X, __Y, __P); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _addcarryx_u32(unsigned char __CF, unsigned int __X, unsigned int __Y, + unsigned int *__P) { + return __builtin_ia32_addcarryx_u32(__CF, __X, __Y, __P); +} + +#ifdef __x86_64__ +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _subborrow_u64(unsigned char __CF, unsigned long long __X, + unsigned long long __Y, unsigned long long *__P) { + return __builtin_ia32_sbb_u64(__CF, __X, __Y, __P); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _addcarry_u64(unsigned char __CF, unsigned long long __X, + unsigned long long __Y, unsigned long long *__P) { + return __builtin_ia32_addcarryx_u64(__CF, __X, __Y, __P); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _addcarryx_u64(unsigned char __CF, unsigned long long __X, + unsigned long long __Y, unsigned long long *__P) { + return __builtin_ia32_addcarryx_u64(__CF, __X, __Y, __P); +} +#endif + +#endif /* _ADXINTRIN_H_INCLUDED */ diff --git a/third_party/intel/ammintrin.internal.h b/third_party/intel/ammintrin.internal.h new file mode 100644 index 000000000..ddbd8efa0 --- /dev/null +++ b/third_party/intel/ammintrin.internal.h @@ -0,0 +1,66 @@ +#ifndef _AMMINTRIN_H_INCLUDED +#define _AMMINTRIN_H_INCLUDED +#include "third_party/intel/pmmintrin.internal.h" + +#ifndef __SSE4A__ +#pragma GCC push_options +#pragma GCC target("sse4a") +#define __DISABLE_SSE4A__ +#endif /* __SSE4A__ */ + +extern __inline void + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_sd(double* __P, __m128d __Y) { + __builtin_ia32_movntsd(__P, (__v2df)__Y); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_ss(float* __P, __m128 __Y) { + __builtin_ia32_movntss(__P, (__v4sf)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_si64(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_extrq((__v2di)__X, (__v16qi)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_extracti_si64(__m128i __X, unsigned const int __I, unsigned const int __L) { + return (__m128i)__builtin_ia32_extrqi((__v2di)__X, __I, __L); +} +#else +#define _mm_extracti_si64(X, I, L) \ + ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(X), (unsigned int)(I), \ + (unsigned int)(L))) +#endif + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_si64(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_insertq((__v2di)__X, (__v2di)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_inserti_si64(__m128i __X, __m128i __Y, unsigned const int __I, + unsigned const int __L) { + return (__m128i)__builtin_ia32_insertqi((__v2di)__X, (__v2di)__Y, __I, __L); +} +#else +#define _mm_inserti_si64(X, Y, I, L) \ + ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (unsigned int)(I), \ + (unsigned int)(L))) +#endif + +#ifdef __DISABLE_SSE4A__ +#undef __DISABLE_SSE4A__ +#pragma GCC pop_options +#endif /* __DISABLE_SSE4A__ */ + +#endif /* _AMMINTRIN_H_INCLUDED */ diff --git a/third_party/intel/avx2intrin.internal.h b/third_party/intel/avx2intrin.internal.h new file mode 100644 index 000000000..524eebd02 --- /dev/null +++ b/third_party/intel/avx2intrin.internal.h @@ -0,0 +1,1492 @@ +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
+#endif + +#ifndef _AVX2INTRIN_H_INCLUDED +#define _AVX2INTRIN_H_INCLUDED + +#ifndef __AVX2__ +#pragma GCC push_options +#pragma GCC target("avx2") +#define __DISABLE_AVX2__ +#endif /* __AVX2__ */ + +#ifdef __OPTIMIZE__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mpsadbw_epu8(__m256i __X, __m256i __Y, const int __M) { + return (__m256i)__builtin_ia32_mpsadbw256((__v32qi)__X, (__v32qi)__Y, __M); +} +#else +#define _mm256_mpsadbw_epu8(X, Y, M) \ + ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \ + (__v32qi)(__m256i)(Y), (int)(M))) +#endif + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_abs_epi8(__m256i __A) { + return (__m256i)__builtin_ia32_pabsb256((__v32qi)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_abs_epi16(__m256i __A) { + return (__m256i)__builtin_ia32_pabsw256((__v16hi)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_abs_epi32(__m256i __A) { + return (__m256i)__builtin_ia32_pabsd256((__v8si)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_packs_epi32(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_packssdw256((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_packs_epi16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_packsswb256((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_packus_epi32(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_packusdw256((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_packus_epi16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_packuswb256((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_add_epi8(__m256i __A, __m256i __B) { + return (__m256i)((__v32qu)__A + (__v32qu)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_add_epi16(__m256i __A, __m256i __B) { + return (__m256i)((__v16hu)__A + (__v16hu)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_add_epi32(__m256i __A, __m256i __B) { + return (__m256i)((__v8su)__A + (__v8su)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_add_epi64(__m256i __A, __m256i __B) { + return (__m256i)((__v4du)__A + (__v4du)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_adds_epi8(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_paddsb256((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_adds_epi16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_paddsw256((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_adds_epu8(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_paddusb256((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_adds_epu16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_paddusw256((__v16hi)__A, (__v16hi)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_alignr_epi8(__m256i __A, __m256i __B, const int __N) { + return (__m256i)__builtin_ia32_palignr256((__v4di)__A, (__v4di)__B, __N * 8); +} +#else +#define _mm256_alignr_epi8(A, B, N) \ + ((__m256i)__builtin_ia32_palignr256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(N)*8)) +#endif + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_and_si256(__m256i __A, __m256i __B) { + return (__m256i)((__v4du)__A & (__v4du)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_andnot_si256(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_andnotsi256((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_avg_epu8(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pavgb256((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_avg_epu16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pavgw256((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_blendv_epi8(__m256i __X, __m256i __Y, __m256i __M) { + return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__X, (__v32qi)__Y, + (__v32qi)__M); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_blend_epi16(__m256i __X, __m256i __Y, const int __M) { + return (__m256i)__builtin_ia32_pblendw256((__v16hi)__X, (__v16hi)__Y, __M); +} +#else +#define _mm256_blend_epi16(X, Y, M) \ + ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(X), \ + (__v16hi)(__m256i)(Y), (int)(M))) +#endif + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpeq_epi8(__m256i __A, __m256i __B) { + return (__m256i)((__v32qi)__A == (__v32qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpeq_epi16(__m256i __A, __m256i __B) { + return (__m256i)((__v16hi)__A == (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpeq_epi32(__m256i __A, __m256i __B) { + return (__m256i)((__v8si)__A == (__v8si)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpeq_epi64(__m256i __A, __m256i __B) { + return (__m256i)((__v4di)__A == (__v4di)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpgt_epi8(__m256i __A, __m256i __B) { + return (__m256i)((__v32qi)__A > (__v32qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpgt_epi16(__m256i __A, __m256i __B) { + return (__m256i)((__v16hi)__A > (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpgt_epi32(__m256i __A, __m256i __B) { + return (__m256i)((__v8si)__A > (__v8si)__B); +} + +extern __inline __m256i + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpgt_epi64(__m256i __A, __m256i __B) { + return (__m256i)((__v4di)__A > (__v4di)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_hadd_epi16(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_phaddw256((__v16hi)__X, (__v16hi)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_hadd_epi32(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_phaddd256((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_hadds_epi16(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__X, (__v16hi)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_hsub_epi16(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_phsubw256((__v16hi)__X, (__v16hi)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_hsub_epi32(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_phsubd256((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_hsubs_epi16(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__X, (__v16hi)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maddubs_epi16(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__X, (__v32qi)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_madd_epi16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_max_epi8(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_max_epi16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_max_epi32(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_max_epu8(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_max_epu16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_max_epu32(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxud256((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_min_epi8(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminsb256((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
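/* _mm256_madd_epi16 above is the workhorse for integer dot products: it
 * multiplies sixteen pairs of signed 16-bit values and sums adjacent
 * products into eight 32-bit lanes, so the individual products cannot
 * overflow. A hedged sketch (acc is a hypothetical accumulator):
 *
 *   __m256i prod = _mm256_madd_epi16(a, b);  // a0*b0+a1*b1, a2*b2+a3*b3, ...
 *   acc = _mm256_add_epi32(acc, prod);       // running 32-bit dot product
 */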
_mm256_min_epi16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminsw256((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_min_epi32(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminsd256((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_min_epu8(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminub256((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_min_epu16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminuw256((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_min_epu32(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminud256((__v8si)__A, (__v8si)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_movemask_epi8(__m256i __A) { + return __builtin_ia32_pmovmskb256((__v32qi)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi8_epi16(__m128i __X) { + return (__m256i)__builtin_ia32_pmovsxbw256((__v16qi)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi8_epi32(__m128i __X) { + return (__m256i)__builtin_ia32_pmovsxbd256((__v16qi)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi8_epi64(__m128i __X) { + return (__m256i)__builtin_ia32_pmovsxbq256((__v16qi)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi16_epi32(__m128i __X) { + return (__m256i)__builtin_ia32_pmovsxwd256((__v8hi)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi16_epi64(__m128i __X) { + return (__m256i)__builtin_ia32_pmovsxwq256((__v8hi)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi32_epi64(__m128i __X) { + return (__m256i)__builtin_ia32_pmovsxdq256((__v4si)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepu8_epi16(__m128i __X) { + return (__m256i)__builtin_ia32_pmovzxbw256((__v16qi)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepu8_epi32(__m128i __X) { + return (__m256i)__builtin_ia32_pmovzxbd256((__v16qi)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepu8_epi64(__m128i __X) { + return (__m256i)__builtin_ia32_pmovzxbq256((__v16qi)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepu16_epi32(__m128i __X) { + return (__m256i)__builtin_ia32_pmovzxwd256((__v8hi)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepu16_epi64(__m128i __X) { + return (__m256i)__builtin_ia32_pmovzxwq256((__v8hi)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepu32_epi64(__m128i __X) { + return (__m256i)__builtin_ia32_pmovzxdq256((__v4si)__X); +} + +extern __inline 
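/* The _mm256_cvtep{i,u}*_epi* conversions above widen from a 128-bit source,
 * sign-extending (epi) or zero-extending (epu), and consume only as many low
 * source elements as fit the 256-bit result. Sketch, assuming p points at 16
 * readable bytes:
 *
 *   __m128i bytes  = _mm_loadu_si128((const __m128i *)p);
 *   __m256i words  = _mm256_cvtepu8_epi16(bytes);  // all 16 bytes -> 16 words
 *   __m256i dwords = _mm256_cvtepu8_epi32(bytes);  // low 8 bytes -> 8 dwords
 */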
__m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mul_epi32(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_pmuldq256((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mulhrs_epi16(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__X, (__v16hi)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mulhi_epu16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mulhi_epi16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mullo_epi16(__m256i __A, __m256i __B) { + return (__m256i)((__v16hu)__A * (__v16hu)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mullo_epi32(__m256i __A, __m256i __B) { + return (__m256i)((__v8su)__A * (__v8su)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mul_epu32(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmuludq256((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_or_si256(__m256i __A, __m256i __B) { + return (__m256i)((__v4du)__A | (__v4du)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sad_epu8(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psadbw256((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shuffle_epi8(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_pshufb256((__v32qi)__X, (__v32qi)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shuffle_epi32(__m256i __A, const int __mask) { + return (__m256i)__builtin_ia32_pshufd256((__v8si)__A, __mask); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shufflehi_epi16(__m256i __A, const int __mask) { + return (__m256i)__builtin_ia32_pshufhw256((__v16hi)__A, __mask); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shufflelo_epi16(__m256i __A, const int __mask) { + return (__m256i)__builtin_ia32_pshuflw256((__v16hi)__A, __mask); +} +#else +#define _mm256_shuffle_epi32(A, N) \ + ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(A), (int)(N))) +#define _mm256_shufflehi_epi16(A, N) \ + ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(A), (int)(N))) +#define _mm256_shufflelo_epi16(A, N) \ + ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(A), (int)(N))) +#endif + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sign_epi8(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psignb256((__v32qi)__X, (__v32qi)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sign_epi16(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psignw256((__v16hi)__X, 
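/* Caveat for _mm256_shuffle_epi8 and the other byte-granular 256-bit
 * operations nearby: they act on each 128-bit half independently, so an
 * index in the high half can only select bytes from the high half. A common
 * workaround is broadcasting one 16-byte table to both halves (table16 is a
 * hypothetical __m128i):
 *
 *   __m256i lut = _mm256_broadcastsi128_si256(table16);
 *   __m256i out = _mm256_shuffle_epi8(lut, idx);   // same LUT in each half
 */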
(__v16hi)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sign_epi32(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psignd256((__v8si)__X, (__v8si)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_bslli_epi128(__m256i __A, const int __N) { + return (__m256i)__builtin_ia32_pslldqi256(__A, __N * 8); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_slli_si256(__m256i __A, const int __N) { + return (__m256i)__builtin_ia32_pslldqi256(__A, __N * 8); +} +#else +#define _mm256_bslli_epi128(A, N) \ + ((__m256i)__builtin_ia32_pslldqi256((__m256i)(A), (int)(N)*8)) +#define _mm256_slli_si256(A, N) \ + ((__m256i)__builtin_ia32_pslldqi256((__m256i)(A), (int)(N)*8)) +#endif + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_slli_epi16(__m256i __A, int __B) { + return (__m256i)__builtin_ia32_psllwi256((__v16hi)__A, __B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sll_epi16(__m256i __A, __m128i __B) { + return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_slli_epi32(__m256i __A, int __B) { + return (__m256i)__builtin_ia32_pslldi256((__v8si)__A, __B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sll_epi32(__m256i __A, __m128i __B) { + return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_slli_epi64(__m256i __A, int __B) { + return (__m256i)__builtin_ia32_psllqi256((__v4di)__A, __B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sll_epi64(__m256i __A, __m128i __B) { + return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_srai_epi16(__m256i __A, int __B) { + return (__m256i)__builtin_ia32_psrawi256((__v16hi)__A, __B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sra_epi16(__m256i __A, __m128i __B) { + return (__m256i)__builtin_ia32_psraw256((__v16hi)__A, (__v8hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_srai_epi32(__m256i __A, int __B) { + return (__m256i)__builtin_ia32_psradi256((__v8si)__A, __B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sra_epi32(__m256i __A, __m128i __B) { + return (__m256i)__builtin_ia32_psrad256((__v8si)__A, (__v4si)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_bsrli_epi128(__m256i __A, const int __N) { + return (__m256i)__builtin_ia32_psrldqi256(__A, __N * 8); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_srli_si256(__m256i __A, const int __N) { + return (__m256i)__builtin_ia32_psrldqi256(__A, __N * 8); +} +#else +#define _mm256_bsrli_epi128(A, N) \ + ((__m256i)__builtin_ia32_psrldqi256((__m256i)(A), (int)(N)*8)) +#define 
_mm256_srli_si256(A, N) \ + ((__m256i)__builtin_ia32_psrldqi256((__m256i)(A), (int)(N)*8)) +#endif + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_srli_epi16(__m256i __A, int __B) { + return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__A, __B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_srl_epi16(__m256i __A, __m128i __B) { + return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_srli_epi32(__m256i __A, int __B) { + return (__m256i)__builtin_ia32_psrldi256((__v8si)__A, __B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_srl_epi32(__m256i __A, __m128i __B) { + return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_srli_epi64(__m256i __A, int __B) { + return (__m256i)__builtin_ia32_psrlqi256((__v4di)__A, __B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_srl_epi64(__m256i __A, __m128i __B) { + return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sub_epi8(__m256i __A, __m256i __B) { + return (__m256i)((__v32qu)__A - (__v32qu)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sub_epi16(__m256i __A, __m256i __B) { + return (__m256i)((__v16hu)__A - (__v16hu)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sub_epi32(__m256i __A, __m256i __B) { + return (__m256i)((__v8su)__A - (__v8su)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sub_epi64(__m256i __A, __m256i __B) { + return (__m256i)((__v4du)__A - (__v4du)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_subs_epi8(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psubsb256((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_subs_epi16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psubsw256((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_subs_epu8(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psubusb256((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_subs_epu16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psubusw256((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_unpackhi_epi8(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_punpckhbw256((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_unpackhi_epi16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_punpckhwd256((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm256_unpackhi_epi32(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_punpckhdq256((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_unpackhi_epi64(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_punpckhqdq256((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_unpacklo_epi8(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_punpcklbw256((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_unpacklo_epi16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_punpcklwd256((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_unpacklo_epi32(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_punpckldq256((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_unpacklo_epi64(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_punpcklqdq256((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_xor_si256(__m256i __A, __m256i __B) { + return (__m256i)((__v4du)__A ^ (__v4du)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_stream_load_si256(__m256i const *__X) { + return (__m256i)__builtin_ia32_movntdqa256((__v4di *)__X); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_broadcastss_ps(__m128 __X) { + return (__m128)__builtin_ia32_vbroadcastss_ps((__v4sf)__X); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcastss_ps(__m128 __X) { + return (__m256)__builtin_ia32_vbroadcastss_ps256((__v4sf)__X); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcastsd_pd(__m128d __X) { + return (__m256d)__builtin_ia32_vbroadcastsd_pd256((__v2df)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcastsi128_si256(__m128i __X) { + return (__m256i)__builtin_ia32_vbroadcastsi256((__v2di)__X); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_blend_epi32(__m128i __X, __m128i __Y, const int __M) { + return (__m128i)__builtin_ia32_pblendd128((__v4si)__X, (__v4si)__Y, __M); +} +#else +#define _mm_blend_epi32(X, Y, M) \ + ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(X), \ + (__v4si)(__m128i)(Y), (int)(M))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_blend_epi32(__m256i __X, __m256i __Y, const int __M) { + return (__m256i)__builtin_ia32_pblendd256((__v8si)__X, (__v8si)__Y, __M); +} +#else +#define _mm256_blend_epi32(X, Y, M) \ + ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), (int)(M))) +#endif + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcastb_epi8(__m128i __X) { + return (__m256i)__builtin_ia32_pbroadcastb256((__v16qi)__X); +} + +extern __inline __m256i + 
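/* The vpbroadcast family below splats the lowest element of a 128-bit
 * source into every destination lane. Sketch (the constant is arbitrary):
 *
 *   __m128i one = _mm_cvtsi32_si128(42);
 *   __m256i all = _mm256_broadcastd_epi32(one);   // eight lanes of 42
 */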
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcastw_epi16(__m128i __X) { + return (__m256i)__builtin_ia32_pbroadcastw256((__v8hi)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcastd_epi32(__m128i __X) { + return (__m256i)__builtin_ia32_pbroadcastd256((__v4si)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcastq_epi64(__m128i __X) { + return (__m256i)__builtin_ia32_pbroadcastq256((__v2di)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_broadcastb_epi8(__m128i __X) { + return (__m128i)__builtin_ia32_pbroadcastb128((__v16qi)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_broadcastw_epi16(__m128i __X) { + return (__m128i)__builtin_ia32_pbroadcastw128((__v8hi)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_broadcastd_epi32(__m128i __X) { + return (__m128i)__builtin_ia32_pbroadcastd128((__v4si)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_broadcastq_epi64(__m128i __X) { + return (__m128i)__builtin_ia32_pbroadcastq128((__v2di)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permutevar8x32_epi32(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_permvarsi256((__v8si)__X, (__v8si)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permute4x64_pd(__m256d __X, const int __M) { + return (__m256d)__builtin_ia32_permdf256((__v4df)__X, __M); +} +#else +#define _mm256_permute4x64_pd(X, M) \ + ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(X), (int)(M))) +#endif + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permutevar8x32_ps(__m256 __X, __m256i __Y) { + return (__m256)__builtin_ia32_permvarsf256((__v8sf)__X, (__v8si)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permute4x64_epi64(__m256i __X, const int __M) { + return (__m256i)__builtin_ia32_permdi256((__v4di)__X, __M); +} +#else +#define _mm256_permute4x64_epi64(X, M) \ + ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(X), (int)(M))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permute2x128_si256(__m256i __X, __m256i __Y, const int __M) { + return (__m256i)__builtin_ia32_permti256((__v4di)__X, (__v4di)__Y, __M); +} +#else +#define _mm256_permute2x128_si256(X, Y, M) \ + ((__m256i)__builtin_ia32_permti256((__v4di)(__m256i)(X), \ + (__v4di)(__m256i)(Y), (int)(M))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_extracti128_si256(__m256i __X, const int __M) { + return (__m128i)__builtin_ia32_extract128i256((__v4di)__X, __M); +} +#else +#define _mm256_extracti128_si256(X, M) \ + ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(X), (int)(M))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_inserti128_si256(__m256i __X, __m128i __Y, const int __M) { + return 
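/* _mm256_extracti128_si256 and _mm256_inserti128_si256 move whole 128-bit
 * halves; extracting both halves is the usual final step when reducing a
 * 256-bit accumulator (acc is a hypothetical __m256i):
 *
 *   __m128i lo = _mm256_extracti128_si256(acc, 0);
 *   __m128i hi = _mm256_extracti128_si256(acc, 1);
 *   __m128i s  = _mm_add_epi32(lo, hi);   // keep reducing in 128 bits
 */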
(__m256i)__builtin_ia32_insert128i256((__v4di)__X, (__v2di)__Y, __M); +} +#else +#define _mm256_inserti128_si256(X, Y, M) \ + ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(X), \ + (__v2di)(__m128i)(Y), (int)(M))) +#endif + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskload_epi32(int const *__X, __m256i __M) { + return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskload_epi64(long long const *__X, __m256i __M) { + return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskload_epi32(int const *__X, __m128i __M) { + return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskload_epi64(long long const *__X, __m128i __M) { + return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) { + __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) { + __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y) { + __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) { + __builtin_ia32_maskstoreq((__v2di *)__X, (__v2di)__M, (__v2di)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sllv_epi32(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sllv_epi32(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sllv_epi64(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sllv_epi64(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_srav_epi32(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srav_epi32(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_srlv_epi32(__m256i __X, __m256i __Y) { + return 
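/* Unlike _mm256_slli/_mm256_srli, which apply one shift count to all lanes,
 * the sllv/srav/srlv forms here take a per-lane count vector; logical shifts
 * by counts >= the element width produce zero in that lane. Sketch:
 *
 *   __m256i n = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
 *   __m256i r = _mm256_srlv_epi32(x, n);   // lane i becomes x[i] >> i
 */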
(__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srlv_epi32(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_srlv_epi64(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srlv_epi64(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_i32gather_pd(double const *__base, __m128i __index, const int __scale) { + __v2df __zero = _mm_setzero_pd(); + __v2df __mask = _mm_cmpeq_pd(__zero, __zero); + + return (__m128d)__builtin_ia32_gathersiv2df(_mm_undefined_pd(), __base, + (__v4si)__index, __mask, __scale); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_i32gather_pd(__m128d __src, double const *__base, __m128i __index, + __m128d __mask, const int __scale) { + return (__m128d)__builtin_ia32_gathersiv2df( + (__v2df)__src, __base, (__v4si)__index, (__v2df)__mask, __scale); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_i32gather_pd(double const *__base, __m128i __index, const int __scale) { + __v4df __zero = _mm256_setzero_pd(); + __v4df __mask = _mm256_cmp_pd(__zero, __zero, _CMP_EQ_OQ); + return (__m256d)__builtin_ia32_gathersiv4df(_mm256_undefined_pd(), __base, + (__v4si)__index, __mask, __scale); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_i32gather_pd(__m256d __src, double const *__base, + __m128i __index, __m256d __mask, + const int __scale) { + return (__m256d)__builtin_ia32_gathersiv4df( + (__v4df)__src, __base, (__v4si)__index, (__v4df)__mask, __scale); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_i64gather_pd(double const *__base, __m128i __index, const int __scale) { + __v2df __src = _mm_setzero_pd(); + __v2df __mask = _mm_cmpeq_pd(__src, __src); + return (__m128d)__builtin_ia32_gatherdiv2df(__src, __base, (__v2di)__index, + __mask, __scale); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_i64gather_pd(__m128d __src, double const *__base, __m128i __index, + __m128d __mask, const int __scale) { + return (__m128d)__builtin_ia32_gatherdiv2df( + (__v2df)__src, __base, (__v2di)__index, (__v2df)__mask, __scale); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_i64gather_pd(double const *__base, __m256i __index, const int __scale) { + __v4df __src = _mm256_setzero_pd(); + __v4df __mask = _mm256_cmp_pd(__src, __src, _CMP_EQ_OQ); + return (__m256d)__builtin_ia32_gatherdiv4df(__src, __base, (__v4di)__index, + __mask, __scale); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_i64gather_pd(__m256d __src, double const *__base, + __m256i __index, __m256d __mask, + const int __scale) { + return (__m256d)__builtin_ia32_gatherdiv4df( + (__v4df)__src, __base, (__v4di)__index, (__v4df)__mask, 
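/* Shared semantics for every gather in this block: element i loads from
 * (char const *)__base + __index[i] * __scale whenever the top bit of
 * __mask element i is set, otherwise the matching __src element passes
 * through unchanged; __scale must be a literal 1, 2, 4, or 8. The unmasked
 * wrappers above simply supply an all-ones mask. Sketch (table and idx are
 * hypothetical):
 *
 *   __m128i idx = _mm_setr_epi32(0, 10, 20, 30);
 *   __m256d v = _mm256_i32gather_pd(table, idx, 8);  // table[0,10,20,30]
 */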
__scale); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_i32gather_ps(float const *__base, __m128i __index, const int __scale) { + __v4sf __src = _mm_setzero_ps(); + __v4sf __mask = _mm_cmpeq_ps(__src, __src); + return (__m128)__builtin_ia32_gathersiv4sf(__src, __base, (__v4si)__index, + __mask, __scale); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_i32gather_ps(__m128 __src, float const *__base, __m128i __index, + __m128 __mask, const int __scale) { + return (__m128)__builtin_ia32_gathersiv4sf( + (__v4sf)__src, __base, (__v4si)__index, (__v4sf)__mask, __scale); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_i32gather_ps(float const *__base, __m256i __index, const int __scale) { + __v8sf __src = _mm256_setzero_ps(); + __v8sf __mask = _mm256_cmp_ps(__src, __src, _CMP_EQ_OQ); + return (__m256)__builtin_ia32_gathersiv8sf(__src, __base, (__v8si)__index, + __mask, __scale); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_i32gather_ps(__m256 __src, float const *__base, __m256i __index, + __m256 __mask, const int __scale) { + return (__m256)__builtin_ia32_gathersiv8sf( + (__v8sf)__src, __base, (__v8si)__index, (__v8sf)__mask, __scale); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_i64gather_ps(float const *__base, __m128i __index, const int __scale) { + __v4sf __src = _mm_setzero_ps(); + __v4sf __mask = _mm_cmpeq_ps(__src, __src); + return (__m128)__builtin_ia32_gatherdiv4sf(__src, __base, (__v2di)__index, + __mask, __scale); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_i64gather_ps(__m128 __src, float const *__base, __m128i __index, + __m128 __mask, const int __scale) { + return (__m128)__builtin_ia32_gatherdiv4sf( + (__v4sf)__src, __base, (__v2di)__index, (__v4sf)__mask, __scale); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_i64gather_ps(float const *__base, __m256i __index, const int __scale) { + __v4sf __src = _mm_setzero_ps(); + __v4sf __mask = _mm_cmpeq_ps(__src, __src); + return (__m128)__builtin_ia32_gatherdiv4sf256(__src, __base, (__v4di)__index, + __mask, __scale); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_i64gather_ps(__m128 __src, float const *__base, __m256i __index, + __m128 __mask, const int __scale) { + return (__m128)__builtin_ia32_gatherdiv4sf256( + (__v4sf)__src, __base, (__v4di)__index, (__v4sf)__mask, __scale); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_i32gather_epi64(long long int const *__base, __m128i __index, + const int __scale) { + __v2di __src = __extension__(__v2di){0, 0}; + __v2di __mask = __extension__(__v2di){~0, ~0}; + return (__m128i)__builtin_ia32_gathersiv2di(__src, __base, (__v4si)__index, + __mask, __scale); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_i32gather_epi64(__m128i __src, long long int const *__base, + __m128i __index, __m128i __mask, + const int __scale) { + return (__m128i)__builtin_ia32_gathersiv2di( + (__v2di)__src, __base, (__v4si)__index, (__v2di)__mask, __scale); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm256_i32gather_epi64(long long int const *__base, __m128i __index, + const int __scale) { + __v4di __src = __extension__(__v4di){0, 0, 0, 0}; + __v4di __mask = __extension__(__v4di){~0, ~0, ~0, ~0}; + return (__m256i)__builtin_ia32_gathersiv4di(__src, __base, (__v4si)__index, + __mask, __scale); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_i32gather_epi64(__m256i __src, long long int const *__base, + __m128i __index, __m256i __mask, + const int __scale) { + return (__m256i)__builtin_ia32_gathersiv4di( + (__v4di)__src, __base, (__v4si)__index, (__v4di)__mask, __scale); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_i64gather_epi64(long long int const *__base, __m128i __index, + const int __scale) { + __v2di __src = __extension__(__v2di){0, 0}; + __v2di __mask = __extension__(__v2di){~0, ~0}; + return (__m128i)__builtin_ia32_gatherdiv2di(__src, __base, (__v2di)__index, + __mask, __scale); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_i64gather_epi64(__m128i __src, long long int const *__base, + __m128i __index, __m128i __mask, + const int __scale) { + return (__m128i)__builtin_ia32_gatherdiv2di( + (__v2di)__src, __base, (__v2di)__index, (__v2di)__mask, __scale); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_i64gather_epi64(long long int const *__base, __m256i __index, + const int __scale) { + __v4di __src = __extension__(__v4di){0, 0, 0, 0}; + __v4di __mask = __extension__(__v4di){~0, ~0, ~0, ~0}; + return (__m256i)__builtin_ia32_gatherdiv4di(__src, __base, (__v4di)__index, + __mask, __scale); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_i64gather_epi64(__m256i __src, long long int const *__base, + __m256i __index, __m256i __mask, + const int __scale) { + return (__m256i)__builtin_ia32_gatherdiv4di( + (__v4di)__src, __base, (__v4di)__index, (__v4di)__mask, __scale); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_i32gather_epi32(int const *__base, __m128i __index, const int __scale) { + __v4si __src = __extension__(__v4si){0, 0, 0, 0}; + __v4si __mask = __extension__(__v4si){~0, ~0, ~0, ~0}; + return (__m128i)__builtin_ia32_gathersiv4si(__src, __base, (__v4si)__index, + __mask, __scale); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_i32gather_epi32(__m128i __src, int const *__base, __m128i __index, + __m128i __mask, const int __scale) { + return (__m128i)__builtin_ia32_gathersiv4si( + (__v4si)__src, __base, (__v4si)__index, (__v4si)__mask, __scale); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_i32gather_epi32(int const *__base, __m256i __index, const int __scale) { + __v8si __src = __extension__(__v8si){0, 0, 0, 0, 0, 0, 0, 0}; + __v8si __mask = __extension__(__v8si){~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0}; + return (__m256i)__builtin_ia32_gathersiv8si(__src, __base, (__v8si)__index, + __mask, __scale); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_i32gather_epi32(__m256i __src, int const *__base, + __m256i __index, __m256i __mask, + const int __scale) { + return (__m256i)__builtin_ia32_gathersiv8si( 
+ (__v8si)__src, __base, (__v8si)__index, (__v8si)__mask, __scale); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_i64gather_epi32(int const *__base, __m128i __index, const int __scale) { + __v4si __src = __extension__(__v4si){0, 0, 0, 0}; + __v4si __mask = __extension__(__v4si){~0, ~0, ~0, ~0}; + return (__m128i)__builtin_ia32_gatherdiv4si(__src, __base, (__v2di)__index, + __mask, __scale); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_i64gather_epi32(__m128i __src, int const *__base, __m128i __index, + __m128i __mask, const int __scale) { + return (__m128i)__builtin_ia32_gatherdiv4si( + (__v4si)__src, __base, (__v2di)__index, (__v4si)__mask, __scale); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_i64gather_epi32(int const *__base, __m256i __index, const int __scale) { + __v4si __src = __extension__(__v4si){0, 0, 0, 0}; + __v4si __mask = __extension__(__v4si){~0, ~0, ~0, ~0}; + return (__m128i)__builtin_ia32_gatherdiv4si256(__src, __base, (__v4di)__index, + __mask, __scale); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_i64gather_epi32(__m128i __src, int const *__base, + __m256i __index, __m128i __mask, + const int __scale) { + return (__m128i)__builtin_ia32_gatherdiv4si256( + (__v4si)__src, __base, (__v4di)__index, (__v4si)__mask, __scale); +} +#else /* __OPTIMIZE__ */ +#define _mm_i32gather_pd(BASE, INDEX, SCALE) \ + (__m128d) __builtin_ia32_gathersiv2df( \ + (__v2df)_mm_setzero_pd(), (double const *)BASE, (__v4si)(__m128i)INDEX, \ + (__v2df)_mm_set1_pd((double)(long long int)-1), (int)SCALE) + +#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128d) __builtin_ia32_gathersiv2df( \ + (__v2df)(__m128d)SRC, (double const *)BASE, (__v4si)(__m128i)INDEX, \ + (__v2df)(__m128d)MASK, (int)SCALE) + +#define _mm256_i32gather_pd(BASE, INDEX, SCALE) \ + (__m256d) __builtin_ia32_gathersiv4df( \ + (__v4df)_mm256_setzero_pd(), (double const *)BASE, \ + (__v4si)(__m128i)INDEX, \ + (__v4df)_mm256_set1_pd((double)(long long int)-1), (int)SCALE) + +#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256d) __builtin_ia32_gathersiv4df( \ + (__v4df)(__m256d)SRC, (double const *)BASE, (__v4si)(__m128i)INDEX, \ + (__v4df)(__m256d)MASK, (int)SCALE) + +#define _mm_i64gather_pd(BASE, INDEX, SCALE) \ + (__m128d) __builtin_ia32_gatherdiv2df( \ + (__v2df)_mm_setzero_pd(), (double const *)BASE, (__v2di)(__m128i)INDEX, \ + (__v2df)_mm_set1_pd((double)(long long int)-1), (int)SCALE) + +#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128d) __builtin_ia32_gatherdiv2df( \ + (__v2df)(__m128d)SRC, (double const *)BASE, (__v2di)(__m128i)INDEX, \ + (__v2df)(__m128d)MASK, (int)SCALE) + +#define _mm256_i64gather_pd(BASE, INDEX, SCALE) \ + (__m256d) __builtin_ia32_gatherdiv4df( \ + (__v4df)_mm256_setzero_pd(), (double const *)BASE, \ + (__v4di)(__m256i)INDEX, \ + (__v4df)_mm256_set1_pd((double)(long long int)-1), (int)SCALE) + +#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256d) __builtin_ia32_gatherdiv4df( \ + (__v4df)(__m256d)SRC, (double const *)BASE, (__v4di)(__m256i)INDEX, \ + (__v4df)(__m256d)MASK, (int)SCALE) + +#define _mm_i32gather_ps(BASE, INDEX, SCALE) \ + (__m128) __builtin_ia32_gathersiv4sf( \ + (__v4sf)_mm_setzero_ps(), (float const *)BASE, (__v4si)(__m128i)INDEX, \ + 
_mm_set1_ps((float)(int)-1), (int)SCALE) + +#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128) __builtin_ia32_gathersiv4sf( \ + (__v4sf)(__m128d)SRC, (float const *)BASE, (__v4si)(__m128i)INDEX, \ + (__v4sf)(__m128d)MASK, (int)SCALE) + +#define _mm256_i32gather_ps(BASE, INDEX, SCALE) \ + (__m256) __builtin_ia32_gathersiv8sf( \ + (__v8sf)_mm256_setzero_ps(), (float const *)BASE, \ + (__v8si)(__m256i)INDEX, (__v8sf)_mm256_set1_ps((float)(int)-1), \ + (int)SCALE) + +#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256) __builtin_ia32_gathersiv8sf( \ + (__v8sf)(__m256)SRC, (float const *)BASE, (__v8si)(__m256i)INDEX, \ + (__v8sf)(__m256d)MASK, (int)SCALE) + +#define _mm_i64gather_ps(BASE, INDEX, SCALE) \ + (__m128) __builtin_ia32_gatherdiv4sf( \ + (__v4sf)_mm_setzero_pd(), (float const *)BASE, (__v2di)(__m128i)INDEX, \ + (__v4sf)_mm_set1_ps((float)(int)-1), (int)SCALE) + +#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128) __builtin_ia32_gatherdiv4sf( \ + (__v4sf)(__m128)SRC, (float const *)BASE, (__v2di)(__m128i)INDEX, \ + (__v4sf)(__m128d)MASK, (int)SCALE) + +#define _mm256_i64gather_ps(BASE, INDEX, SCALE) \ + (__m128) __builtin_ia32_gatherdiv4sf256( \ + (__v4sf)_mm_setzero_ps(), (float const *)BASE, (__v4di)(__m256i)INDEX, \ + (__v4sf)_mm_set1_ps((float)(int)-1), (int)SCALE) + +#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128) __builtin_ia32_gatherdiv4sf256( \ + (__v4sf)(__m128)SRC, (float const *)BASE, (__v4di)(__m256i)INDEX, \ + (__v4sf)(__m128)MASK, (int)SCALE) + +#define _mm_i32gather_epi64(BASE, INDEX, SCALE) \ + (__m128i) __builtin_ia32_gathersiv2di( \ + (__v2di)_mm_setzero_si128(), (long long const *)BASE, \ + (__v4si)(__m128i)INDEX, (__v2di)_mm_set1_epi64x(-1), (int)SCALE) + +#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128i) __builtin_ia32_gathersiv2di( \ + (__v2di)(__m128i)SRC, (long long const *)BASE, (__v4si)(__m128i)INDEX, \ + (__v2di)(__m128i)MASK, (int)SCALE) + +#define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \ + (__m256i) __builtin_ia32_gathersiv4di( \ + (__v4di)_mm256_setzero_si256(), (long long const *)BASE, \ + (__v4si)(__m128i)INDEX, (__v4di)_mm256_set1_epi64x(-1), (int)SCALE) + +#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256i) __builtin_ia32_gathersiv4di( \ + (__v4di)(__m256i)SRC, (long long const *)BASE, (__v4si)(__m128i)INDEX, \ + (__v4di)(__m256i)MASK, (int)SCALE) + +#define _mm_i64gather_epi64(BASE, INDEX, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv2di( \ + (__v2di)_mm_setzero_si128(), (long long const *)BASE, \ + (__v2di)(__m128i)INDEX, (__v2di)_mm_set1_epi64x(-1), (int)SCALE) + +#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv2di( \ + (__v2di)(__m128i)SRC, (long long const *)BASE, (__v2di)(__m128i)INDEX, \ + (__v2di)(__m128i)MASK, (int)SCALE) + +#define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \ + (__m256i) __builtin_ia32_gatherdiv4di( \ + (__v4di)_mm256_setzero_si256(), (long long const *)BASE, \ + (__v4di)(__m256i)INDEX, (__v4di)_mm256_set1_epi64x(-1), (int)SCALE) + +#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256i) __builtin_ia32_gatherdiv4di( \ + (__v4di)(__m256i)SRC, (long long const *)BASE, (__v4di)(__m256i)INDEX, \ + (__v4di)(__m256i)MASK, (int)SCALE) + +#define _mm_i32gather_epi32(BASE, INDEX, SCALE) \ + (__m128i) __builtin_ia32_gathersiv4si( \ + (__v4si)_mm_setzero_si128(), (int const *)BASE, 
(__v4si)(__m128i)INDEX, \ + (__v4si)_mm_set1_epi32(-1), (int)SCALE) + +#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128i) __builtin_ia32_gathersiv4si( \ + (__v4si)(__m128i)SRC, (int const *)BASE, (__v4si)(__m128i)INDEX, \ + (__v4si)(__m128i)MASK, (int)SCALE) + +#define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \ + (__m256i) __builtin_ia32_gathersiv8si( \ + (__v8si)_mm256_setzero_si256(), (int const *)BASE, \ + (__v8si)(__m256i)INDEX, (__v8si)_mm256_set1_epi32(-1), (int)SCALE) + +#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256i) __builtin_ia32_gathersiv8si( \ + (__v8si)(__m256i)SRC, (int const *)BASE, (__v8si)(__m256i)INDEX, \ + (__v8si)(__m256i)MASK, (int)SCALE) + +#define _mm_i64gather_epi32(BASE, INDEX, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv4si( \ + (__v4si)_mm_setzero_si128(), (int const *)BASE, (__v2di)(__m128i)INDEX, \ + (__v4si)_mm_set1_epi32(-1), (int)SCALE) + +#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv4si( \ + (__v4si)(__m128i)SRC, (int const *)BASE, (__v2di)(__m128i)INDEX, \ + (__v4si)(__m128i)MASK, (int)SCALE) + +#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv4si256( \ + (__v4si)_mm_setzero_si128(), (int const *)BASE, (__v4di)(__m256i)INDEX, \ + (__v4si)_mm_set1_epi32(-1), (int)SCALE) + +#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv4si256( \ + (__v4si)(__m128i)SRC, (int const *)BASE, (__v4di)(__m256i)INDEX, \ + (__v4si)(__m128i)MASK, (int)SCALE) +#endif /* __OPTIMIZE__ */ + +#ifdef __DISABLE_AVX2__ +#undef __DISABLE_AVX2__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX2__ */ + +#endif /* _AVX2INTRIN_H_INCLUDED */ diff --git a/third_party/intel/avx5124fmapsintrin.internal.h b/third_party/intel/avx5124fmapsintrin.internal.h new file mode 100644 index 000000000..0c421fbd5 --- /dev/null +++ b/third_party/intel/avx5124fmapsintrin.internal.h @@ -0,0 +1,128 @@ +#if !defined _IMMINTRIN_H_INCLUDED +#error \ + "Never use <avx5124fmapsintrin.h> directly; include <immintrin.h> instead."
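/* Every header vendored in this import follows the pattern visible below:
 * when the matching -m flag is absent, the declarations are bracketed by
 * #pragma GCC push_options / #pragma GCC target(...) / #pragma GCC
 * pop_options, with a __DISABLE_<ISA>__ macro remembering whether to pop.
 * That lets callers opt in per function rather than per translation unit.
 * A hedged sketch (the wrapper name is hypothetical):
 *
 *   __attribute__((target("avx5124fmaps")))
 *   static __m512 fma4_step(__m512 acc, __m512 b, __m512 c, __m512 d,
 *                           __m512 e, __m128 *f) {
 *     return _mm512_4fmadd_ps(acc, b, c, d, e, f);
 *   }
 */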
+#endif + +#ifndef _AVX5124FMAPSINTRIN_H_INCLUDED +#define _AVX5124FMAPSINTRIN_H_INCLUDED + +#ifndef __AVX5124FMAPS__ +#pragma GCC push_options +#pragma GCC target("avx5124fmaps") +#define __DISABLE_AVX5124FMAPS__ +#endif /* __AVX5124FMAPS__ */ + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_4fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __m512 __D, __m512 __E, + __m128 *__F) { + return (__m512)__builtin_ia32_4fmaddps((__v16sf)__B, (__v16sf)__C, + (__v16sf)__D, (__v16sf)__E, + (__v16sf)__A, (const __v4sf *)__F); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_4fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C, + __m512 __D, __m512 __E, __m128 *__F) { + return (__m512)__builtin_ia32_4fmaddps_mask( + (__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A, + (const __v4sf *)__F, (__v16sf)__A, (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_4fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C, + __m512 __D, __m512 __E, __m128 *__F) { + return (__m512)__builtin_ia32_4fmaddps_mask( + (__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A, + (const __v4sf *)__F, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_4fmadd_ss(__m128 __A, __m128 __B, __m128 __C, __m128 __D, __m128 __E, + __m128 *__F) { + return (__m128)__builtin_ia32_4fmaddss((__v4sf)__B, (__v4sf)__C, (__v4sf)__D, + (__v4sf)__E, (__v4sf)__A, + (const __v4sf *)__F); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_4fmadd_ss(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C, + __m128 __D, __m128 __E, __m128 *__F) { + return (__m128)__builtin_ia32_4fmaddss_mask( + (__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A, + (const __v4sf *)__F, (__v4sf)__A, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_4fmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C, + __m128 __D, __m128 __E, __m128 *__F) { + return (__m128)__builtin_ia32_4fmaddss_mask( + (__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A, + (const __v4sf *)__F, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_4fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __m512 __D, + __m512 __E, __m128 *__F) { + return (__m512)__builtin_ia32_4fnmaddps((__v16sf)__B, (__v16sf)__C, + (__v16sf)__D, (__v16sf)__E, + (__v16sf)__A, (const __v4sf *)__F); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_4fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C, + __m512 __D, __m512 __E, __m128 *__F) { + return (__m512)__builtin_ia32_4fnmaddps_mask( + (__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A, + (const __v4sf *)__F, (__v16sf)__A, (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_4fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C, + __m512 __D, __m512 __E, __m128 *__F) { + return (__m512)__builtin_ia32_4fnmaddps_mask( + (__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A, + (const __v4sf *)__F, 
(__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_4fnmadd_ss(__m128 __A, __m128 __B, __m128 __C, __m128 __D, __m128 __E, + __m128 *__F) { + return (__m128)__builtin_ia32_4fnmaddss((__v4sf)__B, (__v4sf)__C, (__v4sf)__D, + (__v4sf)__E, (__v4sf)__A, + (const __v4sf *)__F); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_4fnmadd_ss(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C, + __m128 __D, __m128 __E, __m128 *__F) { + return (__m128)__builtin_ia32_4fnmaddss_mask( + (__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A, + (const __v4sf *)__F, (__v4sf)__A, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_4fnmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C, + __m128 __D, __m128 __E, __m128 *__F) { + return (__m128)__builtin_ia32_4fnmaddss_mask( + (__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A, + (const __v4sf *)__F, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +#ifdef __DISABLE_AVX5124FMAPS__ +#undef __DISABLE_AVX5124FMAPS__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX5124FMAPS__ */ + +#endif /* _AVX5124FMAPSINTRIN_H_INCLUDED */ diff --git a/third_party/intel/avx5124vnniwintrin.internal.h b/third_party/intel/avx5124vnniwintrin.internal.h new file mode 100644 index 000000000..f3c027f16 --- /dev/null +++ b/third_party/intel/avx5124vnniwintrin.internal.h @@ -0,0 +1,78 @@ +#if !defined _IMMINTRIN_H_INCLUDED +#error \ + "Never use <avx5124vnniwintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVX5124VNNIWINTRIN_H_INCLUDED +#define _AVX5124VNNIWINTRIN_H_INCLUDED + +#ifndef __AVX5124VNNIW__ +#pragma GCC push_options +#pragma GCC target("avx5124vnniw") +#define __DISABLE_AVX5124VNNIW__ +#endif /* __AVX5124VNNIW__ */ + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_4dpwssd_epi32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, + __m512i __E, __m128i *__F) { + return (__m512i)__builtin_ia32_vp4dpwssd((__v16si)__B, (__v16si)__C, + (__v16si)__D, (__v16si)__E, + (__v16si)__A, (const __v4si *)__F); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_4dpwssd_epi32(__m512i __A, __mmask16 __U, __m512i __B, + __m512i __C, __m512i __D, __m512i __E, + __m128i *__F) { + return (__m512i)__builtin_ia32_vp4dpwssd_mask( + (__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A, + (const __v4si *)__F, (__v16si)__A, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_4dpwssd_epi32(__mmask16 __U, __m512i __A, __m512i __B, + __m512i __C, __m512i __D, __m512i __E, + __m128i *__F) { + return (__m512i)__builtin_ia32_vp4dpwssd_mask( + (__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A, + (const __v4si *)__F, (__v16si)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_4dpwssds_epi32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, + __m512i __E, __m128i *__F) { + return (__m512i)__builtin_ia32_vp4dpwssds((__v16si)__B, (__v16si)__C, + (__v16si)__D, (__v16si)__E, + (__v16si)__A, (const __v4si *)__F); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_4dpwssds_epi32(__m512i __A, 
__mmask16 __U, __m512i __B, + __m512i __C, __m512i __D, __m512i __E, + __m128i *__F) { + return (__m512i)__builtin_ia32_vp4dpwssds_mask( + (__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A, + (const __v4si *)__F, (__v16si)__A, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_4dpwssds_epi32(__mmask16 __U, __m512i __A, __m512i __B, + __m512i __C, __m512i __D, __m512i __E, + __m128i *__F) { + return (__m512i)__builtin_ia32_vp4dpwssds_mask( + (__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A, + (const __v4si *)__F, (__v16si)_mm512_setzero_ps(), (__mmask16)__U); +} + +#ifdef __DISABLE_AVX5124VNNIW__ +#undef __DISABLE_AVX5124VNNIW__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX5124VNNIW__ */ + +#endif /* _AVX5124VNNIWINTRIN_H_INCLUDED */ diff --git a/third_party/intel/avx512bitalgintrin.internal.h b/third_party/intel/avx512bitalgintrin.internal.h new file mode 100644 index 000000000..e8ea2bd82 --- /dev/null +++ b/third_party/intel/avx512bitalgintrin.internal.h @@ -0,0 +1,213 @@ +#if !defined _IMMINTRIN_H_INCLUDED +#error \ + "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVX512BITALGINTRIN_H_INCLUDED +#define _AVX512BITALGINTRIN_H_INCLUDED + +#ifndef __AVX512BITALG__ +#pragma GCC push_options +#pragma GCC target("avx512bitalg") +#define __DISABLE_AVX512BITALG__ +#endif /* __AVX512BITALG__ */ + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_popcnt_epi8(__m512i __A) { + return (__m512i)__builtin_ia32_vpopcountb_v64qi((__v64qi)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_popcnt_epi16(__m512i __A) { + return (__m512i)__builtin_ia32_vpopcountw_v32hi((__v32hi)__A); +} + +#ifdef __DISABLE_AVX512BITALG__ +#undef __DISABLE_AVX512BITALG__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BITALG__ */ + +#if !defined(__AVX512BITALG__) || !defined(__AVX512BW__) +#pragma GCC push_options +#pragma GCC target("avx512bitalg,avx512bw") +#define __DISABLE_AVX512BITALGBW__ +#endif /* __AVX512VLBW__ */ + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) { + return (__m512i)__builtin_ia32_vpopcountb_v64qi_mask( + (__v64qi)__A, (__v64qi)__B, (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __A) { + return (__m512i)__builtin_ia32_vpopcountb_v64qi_mask( + (__v64qi)__A, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); +} +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) { + return (__m512i)__builtin_ia32_vpopcountw_v32hi_mask( + (__v32hi)__A, (__v32hi)__B, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __A) { + return (__m512i)__builtin_ia32_vpopcountw_v32hi_mask( + (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B) { + return (__mmask64)__builtin_ia32_vpshufbitqmb512_mask( + (__v64qi)__A, (__v64qi)__B, (__mmask64)-1); +} + +extern __inline 
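/* Naming convention used throughout these AVX-512 headers: _mask_ variants
 * merge, keeping the pass-through operand in lanes whose mask bit is clear;
 * _maskz_ variants zero those lanes; plain __mmask* returns are ordinary
 * bitmasks. Sketch for the byte popcount above (values illustrative):
 *
 *   __m512i v = _mm512_set1_epi8(0x0f);
 *   __mmask64 k = 0xffffffffull;                 // low 32 bytes only
 *   __m512i p = _mm512_maskz_popcnt_epi8(k, v);  // 4s below, 0s above
 */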
__mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_bitshuffle_epi64_mask(__mmask64 __M, __m512i __A, __m512i __B) { + return (__mmask64)__builtin_ia32_vpshufbitqmb512_mask( + (__v64qi)__A, (__v64qi)__B, (__mmask64)__M); +} + +#ifdef __DISABLE_AVX512BITALGBW__ +#undef __DISABLE_AVX512BITALGBW__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BITALGBW__ */ + +#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__) || \ + !defined(__AVX512BW__) +#pragma GCC push_options +#pragma GCC target("avx512bitalg,avx512vl,avx512bw") +#define __DISABLE_AVX512BITALGVLBW__ +#endif /* __AVX512VLBW__ */ + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) { + return (__m256i)__builtin_ia32_vpopcountb_v32qi_mask( + (__v32qi)__A, (__v32qi)__B, (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __A) { + return (__m256i)__builtin_ia32_vpopcountb_v32qi_mask( + (__v32qi)__A, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B) { + return (__mmask32)__builtin_ia32_vpshufbitqmb256_mask( + (__v32qi)__A, (__v32qi)__B, (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_bitshuffle_epi64_mask(__mmask32 __M, __m256i __A, __m256i __B) { + return (__mmask32)__builtin_ia32_vpshufbitqmb256_mask( + (__v32qi)__A, (__v32qi)__B, (__mmask32)__M); +} + +#ifdef __DISABLE_AVX512BITALGVLBW__ +#undef __DISABLE_AVX512BITALGVLBW__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BITALGVLBW__ */ + +#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__) +#pragma GCC push_options +#pragma GCC target("avx512bitalg,avx512vl") +#define __DISABLE_AVX512BITALGVL__ +#endif /* __AVX512VLBW__ */ + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B) { + return (__mmask16)__builtin_ia32_vpshufbitqmb128_mask( + (__v16qi)__A, (__v16qi)__B, (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_bitshuffle_epi64_mask(__mmask16 __M, __m128i __A, __m128i __B) { + return (__mmask16)__builtin_ia32_vpshufbitqmb128_mask( + (__v16qi)__A, (__v16qi)__B, (__mmask16)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_popcnt_epi8(__m256i __A) { + return (__m256i)__builtin_ia32_vpopcountb_v32qi((__v32qi)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_popcnt_epi16(__m256i __A) { + return (__m256i)__builtin_ia32_vpopcountw_v16hi((__v16hi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_popcnt_epi8(__m128i __A) { + return (__m128i)__builtin_ia32_vpopcountb_v16qi((__v16qi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_popcnt_epi16(__m128i __A) { + return (__m128i)__builtin_ia32_vpopcountw_v8hi((__v8hi)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_popcnt_epi16(__m256i __A, 
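
/*
 * Editorial sketch (not part of the patch): per-byte popcount over a
 * 64-byte block, assuming -mavx512bitalg -mavx512bw and this tree's
 * third_party/intel/immintrin.internal.h, plus the AVX-512F helper
 * _mm512_reduce_add_epi64:
 *
 *   __m512i v = _mm512_loadu_si512(p);                       // 64 bytes
 *   __m512i c = _mm512_popcnt_epi8(v);                       // bits per byte
 *   __m512i s = _mm512_sad_epu8(c, _mm512_setzero_si512());  // 8 partial sums
 *   total += _mm512_reduce_add_epi64(s);                     // horizontal sum
 *
 * _mm512_bitshuffle_epi64_mask(__A, __B) instead gathers, for each byte of
 * __B, the bit of the corresponding 64-bit lane of __A selected by that
 * byte's low six bits, packing the 64 results into a __mmask64.
 */
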
__mmask16 __U, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpopcountw_v16hi_mask(
+      (__v16hi)__A, (__v16hi)__B, (__mmask16)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_vpopcountw_v16hi_mask(
+      (__v16hi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) {
+  return (__m128i)__builtin_ia32_vpopcountb_v16qi_mask(
+      (__v16qi)__A, (__v16qi)__B, (__mmask16)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_vpopcountb_v16qi_mask(
+      (__v16qi)__A, (__v16qi)_mm_setzero_si128(), (__mmask16)__U);
+}
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) {
+  return (__m128i)__builtin_ia32_vpopcountw_v8hi_mask((__v8hi)__A, (__v8hi)__B,
+                                                      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_vpopcountw_v8hi_mask(
+      (__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
+}
+#ifdef __DISABLE_AVX512BITALGVL__
+#undef __DISABLE_AVX512BITALGVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BITALGVL__ */
+
+#endif /* _AVX512BITALGINTRIN_H_INCLUDED */
diff --git a/third_party/intel/avx512bwintrin.internal.h b/third_party/intel/avx512bwintrin.internal.h
new file mode 100644
index 000000000..86356d265
--- /dev/null
+++ b/third_party/intel/avx512bwintrin.internal.h
@@ -0,0 +1,2454 @@
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
+#endif + +#ifndef _AVX512BWINTRIN_H_INCLUDED +#define _AVX512BWINTRIN_H_INCLUDED + +#ifndef __AVX512BW__ +#pragma GCC push_options +#pragma GCC target("avx512bw") +#define __DISABLE_AVX512BW__ +#endif /* __AVX512BW__ */ + +typedef short __v32hi __attribute__((__vector_size__(64))); +typedef char __v64qi __attribute__((__vector_size__(64))); + +typedef unsigned long long __mmask64; + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _ktest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__CF) { + *__CF = (unsigned char)__builtin_ia32_ktestcsi(__A, __B); + return (unsigned char)__builtin_ia32_ktestzsi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _ktest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__CF) { + *__CF = (unsigned char)__builtin_ia32_ktestcdi(__A, __B); + return (unsigned char)__builtin_ia32_ktestzdi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _ktestz_mask32_u8(__mmask32 __A, __mmask32 __B) { + return (unsigned char)__builtin_ia32_ktestzsi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _ktestz_mask64_u8(__mmask64 __A, __mmask64 __B) { + return (unsigned char)__builtin_ia32_ktestzdi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _ktestc_mask32_u8(__mmask32 __A, __mmask32 __B) { + return (unsigned char)__builtin_ia32_ktestcsi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _ktestc_mask64_u8(__mmask64 __A, __mmask64 __B) { + return (unsigned char)__builtin_ia32_ktestcdi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kortest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__CF) { + *__CF = (unsigned char)__builtin_ia32_kortestcsi(__A, __B); + return (unsigned char)__builtin_ia32_kortestzsi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kortest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__CF) { + *__CF = (unsigned char)__builtin_ia32_kortestcdi(__A, __B); + return (unsigned char)__builtin_ia32_kortestzdi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kortestz_mask32_u8(__mmask32 __A, __mmask32 __B) { + return (unsigned char)__builtin_ia32_kortestzsi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kortestz_mask64_u8(__mmask64 __A, __mmask64 __B) { + return (unsigned char)__builtin_ia32_kortestzdi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kortestc_mask32_u8(__mmask32 __A, __mmask32 __B) { + return (unsigned char)__builtin_ia32_kortestcsi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kortestc_mask64_u8(__mmask64 __A, __mmask64 __B) { + return (unsigned char)__builtin_ia32_kortestcdi(__A, __B); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kadd_mask32(__mmask32 __A, __mmask32 __B) { + return (__mmask32)__builtin_ia32_kaddsi((__mmask32)__A, (__mmask32)__B); +} + 
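
/*
 * Editorial sketch (not part of the patch): these helpers wrap the
 * KTEST/KORTEST/KADD mask instructions, e.g. an early-exit match test
 * (handle_match is an illustrative callee):
 *
 *   __mmask64 hits = _mm512_cmpeq_epi8_mask(haystack, needle);
 *   if (!_kortestz_mask64_u8(hits, hits))   // KORTESTQ ZF: any lane set?
 *     handle_match(hits);
 */
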
+extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kadd_mask64(__mmask64 __A, __mmask64 __B) { + return (__mmask64)__builtin_ia32_kadddi((__mmask64)__A, (__mmask64)__B); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _cvtmask32_u32(__mmask32 __A) { + return (unsigned int)__builtin_ia32_kmovd((__mmask32)__A); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _cvtmask64_u64(__mmask64 __A) { + return (unsigned long long)__builtin_ia32_kmovq((__mmask64)__A); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _cvtu32_mask32(unsigned int __A) { + return (__mmask32)__builtin_ia32_kmovd((__mmask32)__A); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _cvtu64_mask64(unsigned long long __A) { + return (__mmask64)__builtin_ia32_kmovq((__mmask64)__A); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _load_mask32(__mmask32 *__A) { + return (__mmask32)__builtin_ia32_kmovd(*__A); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _load_mask64(__mmask64 *__A) { + return (__mmask64)__builtin_ia32_kmovq(*(__mmask64 *)__A); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _store_mask32(__mmask32 *__A, __mmask32 __B) { + *(__mmask32 *)__A = __builtin_ia32_kmovd(__B); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _store_mask64(__mmask64 *__A, __mmask64 __B) { + *(__mmask64 *)__A = __builtin_ia32_kmovq(__B); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _knot_mask32(__mmask32 __A) { + return (__mmask32)__builtin_ia32_knotsi((__mmask32)__A); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _knot_mask64(__mmask64 __A) { + return (__mmask64)__builtin_ia32_knotdi((__mmask64)__A); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kor_mask32(__mmask32 __A, __mmask32 __B) { + return (__mmask32)__builtin_ia32_korsi((__mmask32)__A, (__mmask32)__B); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kor_mask64(__mmask64 __A, __mmask64 __B) { + return (__mmask64)__builtin_ia32_kordi((__mmask64)__A, (__mmask64)__B); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kxnor_mask32(__mmask32 __A, __mmask32 __B) { + return (__mmask32)__builtin_ia32_kxnorsi((__mmask32)__A, (__mmask32)__B); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kxnor_mask64(__mmask64 __A, __mmask64 __B) { + return (__mmask64)__builtin_ia32_kxnordi((__mmask64)__A, (__mmask64)__B); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kxor_mask32(__mmask32 __A, __mmask32 __B) { + return (__mmask32)__builtin_ia32_kxorsi((__mmask32)__A, (__mmask32)__B); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kxor_mask64(__mmask64 __A, __mmask64 __B) { + return (__mmask64)__builtin_ia32_kxordi((__mmask64)__A, (__mmask64)__B); +} + +extern __inline __mmask32 + 
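
/*
 * Editorial sketch (not part of the patch): _load_mask64/_store_mask64 and
 * the _cvt helpers move opmasks between k-registers, GPRs, and memory, so a
 * mask can be spilled and reworked without touching vector state (saved is
 * an illustrative __mmask64 slot):
 *
 *   __mmask64 k = _mm512_test_epi8_mask(v, v);   // nonzero bytes
 *   _store_mask64(&saved, k);
 *   k = _knot_mask64(_load_mask64(&saved));      // complement on reload
 */
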
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kand_mask32(__mmask32 __A, __mmask32 __B) { + return (__mmask32)__builtin_ia32_kandsi((__mmask32)__A, (__mmask32)__B); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kand_mask64(__mmask64 __A, __mmask64 __B) { + return (__mmask64)__builtin_ia32_kanddi((__mmask64)__A, (__mmask64)__B); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kandn_mask32(__mmask32 __A, __mmask32 __B) { + return (__mmask32)__builtin_ia32_kandnsi((__mmask32)__A, (__mmask32)__B); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kandn_mask64(__mmask64 __A, __mmask64 __B) { + return (__mmask64)__builtin_ia32_kandndi((__mmask64)__A, (__mmask64)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_mov_epi16(__m512i __W, __mmask32 __U, __m512i __A) { + return (__m512i)__builtin_ia32_movdquhi512_mask((__v32hi)__A, (__v32hi)__W, + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) { + return (__m512i)__builtin_ia32_movdquhi512_mask( + (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const *__P) { + return (__m512i)__builtin_ia32_loaddquhi512_mask( + (const short *)__P, (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_loadu_epi16(__mmask32 __U, void const *__P) { + return (__m512i)__builtin_ia32_loaddquhi512_mask( + (const short *)__P, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_storeu_epi16(void *__P, __mmask32 __U, __m512i __A) { + __builtin_ia32_storedquhi512_mask((short *)__P, (__v32hi)__A, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) { + return (__m512i)__builtin_ia32_movdquqi512_mask((__v64qi)__A, (__v64qi)__W, + (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mov_epi8(__mmask64 __U, __m512i __A) { + return (__m512i)__builtin_ia32_movdquqi512_mask( + (__v64qi)__A, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_kunpackw(__mmask32 __A, __mmask32 __B) { + return (__mmask32)__builtin_ia32_kunpcksi((__mmask32)__A, (__mmask32)__B); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kunpackw_mask32(__mmask16 __A, __mmask16 __B) { + return (__mmask32)__builtin_ia32_kunpcksi((__mmask32)__A, (__mmask32)__B); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_kunpackd(__mmask64 __A, __mmask64 __B) { + return (__mmask64)__builtin_ia32_kunpckdi((__mmask64)__A, (__mmask64)__B); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kunpackd_mask64(__mmask32 __A, 
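
/*
 * Editorial sketch (not part of the patch): masked loads and stores make
 * tail handling branch-free; for n < 32 trailing 16-bit elements:
 *
 *   __mmask32 k = _cvtu32_mask32((1u << n) - 1);    // n low bits set
 *   __m512i v = _mm512_maskz_loadu_epi16(k, src);   // no out-of-bounds read
 *   _mm512_mask_storeu_epi16(dst, k, v);            // no out-of-bounds write
 */
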
__mmask32 __B) { + return (__mmask64)__builtin_ia32_kunpckdi((__mmask64)__A, (__mmask64)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_loadu_epi8(__m512i __W, __mmask64 __U, void const *__P) { + return (__m512i)__builtin_ia32_loaddquqi512_mask( + (const char *)__P, (__v64qi)__W, (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_loadu_epi8(__mmask64 __U, void const *__P) { + return (__m512i)__builtin_ia32_loaddquqi512_mask( + (const char *)__P, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_storeu_epi8(void *__P, __mmask64 __U, __m512i __A) { + __builtin_ia32_storedquqi512_mask((char *)__P, (__v64qi)__A, (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sad_epu8(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psadbw512((__v64qi)__A, (__v64qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi16_epi8(__m512i __A) { + return (__m256i)__builtin_ia32_pmovwb512_mask( + (__v32hi)__A, (__v32qi)_mm256_undefined_si256(), (__mmask32)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi16_storeu_epi8(void *__P, __mmask32 __M, __m512i __A) { + __builtin_ia32_pmovwb512mem_mask((__v32qi *)__P, (__v32hi)__A, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi16_epi8(__m256i __O, __mmask32 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovwb512_mask((__v32hi)__A, (__v32qi)__O, + __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi16_epi8(__mmask32 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovwb512_mask( + (__v32hi)__A, (__v32qi)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtsepi16_epi8(__m512i __A) { + return (__m256i)__builtin_ia32_pmovswb512_mask( + (__v32hi)__A, (__v32qi)_mm256_undefined_si256(), (__mmask32)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtsepi16_storeu_epi8(void *__P, __mmask32 __M, __m512i __A) { + __builtin_ia32_pmovswb512mem_mask((__v32qi *)__P, (__v32hi)__A, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtsepi16_epi8(__m256i __O, __mmask32 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovswb512_mask((__v32hi)__A, (__v32qi)__O, + __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtsepi16_epi8(__mmask32 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovswb512_mask( + (__v32hi)__A, (__v32qi)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtusepi16_epi8(__m512i __A) { + return (__m256i)__builtin_ia32_pmovuswb512_mask( + (__v32hi)__A, (__v32qi)_mm256_undefined_si256(), (__mmask32)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtusepi16_epi8(__m256i __O, __mmask32 
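
/*
 * Editorial sketch (not part of the patch): _mm512_sad_epu8 produces eight
 * 64-bit sums of absolute byte differences, the usual L1-distance kernel
 * (assuming the AVX-512F helper _mm512_reduce_add_epi64):
 *
 *   __m512i d = _mm512_sad_epu8(a, b);   // 8 partial sums
 *   l1 += _mm512_reduce_add_epi64(d);
 *
 * The pmovwb family above narrows 32 x i16 to 32 x i8 by truncation
 * (cvtepi16_epi8) or by signed/unsigned saturation (cvtsepi16_epi8 and
 * cvtusepi16_epi8), each with merge- and zero-masked forms.
 */
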
__M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovuswb512_mask((__v32hi)__A, (__v32qi)__O, + __M); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtusepi16_storeu_epi8(void *__P, __mmask32 __M, __m512i __A) { + __builtin_ia32_pmovuswb512mem_mask((__v32qi *)__P, (__v32hi)__A, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtusepi16_epi8(__mmask32 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovuswb512_mask( + (__v32hi)__A, (__v32qi)_mm256_setzero_si256(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcastb_epi8(__m128i __A) { + return (__m512i)__builtin_ia32_pbroadcastb512_mask( + (__v16qi)__A, (__v64qi)_mm512_undefined_epi32(), (__mmask64)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_broadcastb_epi8(__m512i __O, __mmask64 __M, __m128i __A) { + return (__m512i)__builtin_ia32_pbroadcastb512_mask((__v16qi)__A, (__v64qi)__O, + __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) { + return (__m512i)__builtin_ia32_pbroadcastb512_mask( + (__v16qi)__A, (__v64qi)_mm512_setzero_si512(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) { + return (__m512i)__builtin_ia32_pbroadcastb512_gpr_mask(__A, (__v64qi)__O, + __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_set1_epi8(__mmask64 __M, char __A) { + return (__m512i)__builtin_ia32_pbroadcastb512_gpr_mask( + __A, (__v64qi)_mm512_setzero_si512(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcastw_epi16(__m128i __A) { + return (__m512i)__builtin_ia32_pbroadcastw512_mask( + (__v8hi)__A, (__v32hi)_mm512_undefined_epi32(), (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A) { + return (__m512i)__builtin_ia32_pbroadcastw512_mask((__v8hi)__A, (__v32hi)__O, + __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) { + return (__m512i)__builtin_ia32_pbroadcastw512_mask( + (__v8hi)__A, (__v32hi)_mm512_setzero_si512(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) { + return (__m512i)__builtin_ia32_pbroadcastw512_gpr_mask(__A, (__v32hi)__O, + __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_set1_epi16(__mmask32 __M, short __A) { + return (__m512i)__builtin_ia32_pbroadcastw512_gpr_mask( + __A, (__v32hi)_mm512_setzero_si512(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mulhrs_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmulhrsw512_mask( + (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, 
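
/*
 * Editorial sketch (not part of the patch): the masked set1/broadcast forms
 * splat a scalar only into selected lanes:
 *
 *   __mmask64 k = _mm512_cmpeq_epi8_mask(v, _mm512_set1_epi8(' '));
 *   v = _mm512_mask_set1_epi8(v, k, '_');   // spaces become underscores
 */
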
__always_inline__, + __artificial__)) +_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmulhrsw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmulhrsw512_mask( + (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mulhi_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmulhw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmulhw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmulhw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mulhi_epu16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmulhuw512_mask( + (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmulhuw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mulhi_epu16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmulhuw512_mask( + (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mullo_epi16(__m512i __A, __m512i __B) { + return (__m512i)((__v32hu)__A * (__v32hu)__B); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmullw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmullw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi8_epi16(__m256i __A) { + return (__m512i)__builtin_ia32_pmovsxbw512_mask( + (__v32qi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A) { + return (__m512i)__builtin_ia32_pmovsxbw512_mask((__v32qi)__A, (__v32hi)__W, 
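
/*
 * Editorial sketch (not part of the patch): _mm512_mulhrs_epi16 is the
 * rounding Q15 fixed-point multiply ((a * b + 0x4000) >> 15), while
 * mulhi/mullo return the high and low 16 bits of the full 32-bit product:
 *
 *   __m512i half = _mm512_set1_epi16(0x4000);   // 0.5 in Q15
 *   __m512i y = _mm512_mulhrs_epi16(x, half);   // scale 32 samples by 0.5
 */
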
+ (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) { + return (__m512i)__builtin_ia32_pmovsxbw512_mask( + (__v32qi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepu8_epi16(__m256i __A) { + return (__m512i)__builtin_ia32_pmovzxbw512_mask( + (__v32qi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A) { + return (__m512i)__builtin_ia32_pmovzxbw512_mask((__v32qi)__A, (__v32hi)__W, + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) { + return (__m512i)__builtin_ia32_pmovzxbw512_mask( + (__v32qi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permutexvar_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_permvarhi512_mask( + (__v32hi)__B, (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permutexvar_epi16(__mmask32 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_permvarhi512_mask( + (__v32hi)__B, (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), + (__mmask32)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_permutexvar_epi16(__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_permvarhi512_mask( + (__v32hi)__B, (__v32hi)__A, (__v32hi)__W, (__mmask32)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) { + return (__m512i)__builtin_ia32_vpermt2varhi512_mask( + (__v32hi)__I + /* idx */, + (__v32hi)__A, (__v32hi)__B, (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I, + __m512i __B) { + return (__m512i)__builtin_ia32_vpermt2varhi512_mask( + (__v32hi)__I + /* idx */, + (__v32hi)__A, (__v32hi)__B, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, + __m512i __B) { + return (__m512i)__builtin_ia32_vpermi2varhi512_mask((__v32hi)__A, + (__v32hi)__I + /* idx */, + (__v32hi)__B, + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, + __m512i __B) { + return (__m512i)__builtin_ia32_vpermt2varhi512_maskz( + (__v32hi)__I + /* idx */, + (__v32hi)__A, (__v32hi)__B, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_avg_epu8(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pavgb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)-1); +} + +extern __inline __m512i + 
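
/*
 * Editorial sketch (not part of the patch): permutexvar indexes across the
 * full register (index vector first, data second); reversing 32 words:
 *
 *   static const short rev[32] = {31, 30, ..., 1, 0};   // abridged
 *   __m512i r = _mm512_permutexvar_epi16(_mm512_loadu_si512(rev), v);
 *
 * The permutex2var forms pick from two sources, using bit 5 of each index
 * to choose between them.
 */
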
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_avg_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pavgb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)__W, (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_avg_epu8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pavgb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_add_epi8(__m512i __A, __m512i __B) { + return (__m512i)((__v64qu)__A + (__v64qu)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)__W, (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sub_epi8(__m512i __A, __m512i __B) { + return (__m512i)((__v64qu)__A - (__v64qu)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)__W, (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_avg_epu16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pavgw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_avg_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pavgw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_avg_epu16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pavgw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_subs_epi8(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubsb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_subs_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubsb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)__W, (__mmask64)__U); +} + +extern __inline __m512i + 
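
/*
 * Editorial sketch (not part of the patch): _mm512_avg_epu8 is the rounding
 * average (a + b + 1) >> 1 computed without overflow, e.g. a cheap 2:1
 * vertical box filter over two pixel rows:
 *
 *   __m512i blended = _mm512_avg_epu8(row0, row1);
 */
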
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_subs_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubsb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_subs_epu8(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubusb512_mask( + (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), + (__mmask64)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_subs_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubusb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)__W, (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_subs_epu8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubusb512_mask( + (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), + (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_adds_epi8(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddsb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_adds_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddsb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)__W, (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_adds_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddsb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_adds_epu8(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddusb512_mask( + (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), + (__mmask64)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_adds_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddusb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)__W, (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_adds_epu8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddusb512_mask( + (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), + (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sub_epi16(__m512i __A, __m512i __B) { + return (__m512i)((__v32hu)__A - (__v32hu)__B); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return 
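
/*
 * Editorial sketch (not part of the patch): the adds/subs forms saturate
 * where add/sub wrap; with unsigned bytes, 200 + 100 clamps to 255 under
 * _mm512_adds_epu8 but wraps to 44 under _mm512_add_epi8:
 *
 *   __m512i brighter = _mm512_adds_epu8(pixels, _mm512_set1_epi8(32));
 */
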
(__m512i)__builtin_ia32_psubw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_subs_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubsw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_subs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubsw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_subs_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubsw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_subs_epu16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubusw512_mask( + (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_subs_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubusw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_subs_epu16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubusw512_mask( + (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_add_epi16(__m512i __A, __m512i __B) { + return (__m512i)((__v32hu)__A + (__v32hu)__B); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_adds_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddsw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_adds_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddsw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_adds_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddsw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm512_adds_epu16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddusw512_mask( + (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddusw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddusw512_mask( + (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_srl_epi16(__m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psrlw512_mask((__v32hi)__A, (__v8hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psrlw512_mask((__v32hi)__A, (__v8hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psrlw512_mask((__v32hi)__A, (__v8hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_packs_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_packsswb512_mask( + (__v32hi)__A, (__v32hi)__B, (__v64qi)_mm512_setzero_si512(), + (__mmask64)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sll_epi16(__m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psllw512_mask((__v32hi)__A, (__v8hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psllw512_mask((__v32hi)__A, (__v8hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psllw512_mask((__v32hi)__A, (__v8hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maddubs_epi16(__m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_pmaddubsw512_mask( + (__v64qi)__X, (__v64qi)__Y, (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X, + __m512i __Y) { + return (__m512i)__builtin_ia32_pmaddubsw512_mask( + (__v64qi)__X, (__v64qi)__Y, (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) { + return 
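
/*
 * Editorial sketch (not part of the patch): _mm512_maddubs_epi16 multiplies
 * unsigned bytes of __X by signed bytes of __Y and adds adjacent products
 * into saturating 16-bit lanes; chained with madd it is the classic int8
 * dot-product kernel:
 *
 *   __m512i p16 = _mm512_maddubs_epi16(u8s, s8s);
 *   __m512i p32 = _mm512_madd_epi16(p16, _mm512_set1_epi16(1));
 *   acc = _mm512_add_epi32(acc, p32);   // AVX-512F accumulate
 */
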
(__m512i)__builtin_ia32_pmaddubsw512_mask( + (__v64qi)__X, (__v64qi)__Y, (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_madd_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaddwd512_mask( + (__v32hi)__A, (__v32hi)__B, (__v16si)_mm512_setzero_si512(), + (__mmask16)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaddwd512_mask((__v32hi)__A, (__v32hi)__B, + (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaddwd512_mask( + (__v32hi)__A, (__v32hi)__B, (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_unpackhi_epi8(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_punpckhbw512_mask( + (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), + (__mmask64)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_punpckhbw512_mask( + (__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_punpckhbw512_mask( + (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), + (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_unpackhi_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_punpckhwd512_mask( + (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_punpckhwd512_mask( + (__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_punpckhwd512_mask( + (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_unpacklo_epi8(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_punpcklbw512_mask( + (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), + (__mmask64)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_punpcklbw512_mask( + (__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + return 
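
/*
 * Editorial sketch (not part of the patch): the unpack intrinsics
 * interleave the low/high halves of each 128-bit lane, e.g. the SSE-style
 * zero-extension of bytes to words:
 *
 *   __m512i lo = _mm512_unpacklo_epi8(v, _mm512_setzero_si512());
 *   __m512i hi = _mm512_unpackhi_epi8(v, _mm512_setzero_si512());
 *
 * (cvtepu8_epi16 above is usually the simpler widening path.)
 */
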
(__m512i)__builtin_ia32_punpcklbw512_mask( + (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), + (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_unpacklo_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_punpcklwd512_mask( + (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { + return (__m512i)__builtin_ia32_punpcklwd512_mask( + (__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_punpcklwd512_mask( + (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpeq_epu8_mask(__m512i __A, __m512i __B) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__A, (__v64qi)__B, 0, + (__mmask64)-1); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpeq_epi8_mask(__m512i __A, __m512i __B) { + return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__A, (__v64qi)__B, + (__mmask64)-1); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpeq_epu8_mask(__mmask64 __U, __m512i __A, __m512i __B) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__A, (__v64qi)__B, 0, + __U); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpeq_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B) { + return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__A, (__v64qi)__B, + __U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpeq_epu16_mask(__m512i __A, __m512i __B) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__A, (__v32hi)__B, 0, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpeq_epi16_mask(__m512i __A, __m512i __B) { + return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__A, (__v32hi)__B, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpeq_epu16_mask(__mmask32 __U, __m512i __A, __m512i __B) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__A, (__v32hi)__B, 0, + __U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpeq_epi16_mask(__mmask32 __U, __m512i __A, __m512i __B) { + return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__A, (__v32hi)__B, + __U); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpgt_epu8_mask(__m512i __A, __m512i __B) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__A, (__v64qi)__B, 6, + (__mmask64)-1); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpgt_epi8_mask(__m512i __A, __m512i __B) { + return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__A, (__v64qi)__B, 
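
/*
 * Editorial sketch (not part of the patch): AVX-512 comparisons return
 * opmasks rather than 0/-1 vectors, so counting newline bytes needs no
 * separate movemask step:
 *
 *   __mmask64 k = _mm512_cmpeq_epi8_mask(v, _mm512_set1_epi8('\n'));
 *   n += __builtin_popcountll(_cvtmask64_u64(k));
 */
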
+ (__mmask64)-1); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpgt_epu8_mask(__mmask64 __U, __m512i __A, __m512i __B) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__A, (__v64qi)__B, 6, + __U); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpgt_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B) { + return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__A, (__v64qi)__B, + __U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpgt_epu16_mask(__m512i __A, __m512i __B) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__A, (__v32hi)__B, 6, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpgt_epi16_mask(__m512i __A, __m512i __B) { + return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__A, (__v32hi)__B, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpgt_epu16_mask(__mmask32 __U, __m512i __A, __m512i __B) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__A, (__v32hi)__B, 6, + __U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpgt_epi16_mask(__mmask32 __U, __m512i __A, __m512i __B) { + return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__A, (__v32hi)__B, + __U); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_movepi8_mask(__m512i __A) { + return (__mmask64)__builtin_ia32_cvtb2mask512((__v64qi)__A); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_movepi16_mask(__m512i __A) { + return (__mmask32)__builtin_ia32_cvtw2mask512((__v32hi)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_movm_epi8(__mmask64 __A) { + return (__m512i)__builtin_ia32_cvtmask2b512(__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_movm_epi16(__mmask32 __A) { + return (__m512i)__builtin_ia32_cvtmask2w512(__A); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_test_epi8_mask(__m512i __A, __m512i __B) { + return (__mmask64)__builtin_ia32_ptestmb512((__v64qi)__A, (__v64qi)__B, + (__mmask64)-1); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_test_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B) { + return (__mmask64)__builtin_ia32_ptestmb512((__v64qi)__A, (__v64qi)__B, __U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_test_epi16_mask(__m512i __A, __m512i __B) { + return (__mmask32)__builtin_ia32_ptestmw512((__v32hi)__A, (__v32hi)__B, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_test_epi16_mask(__mmask32 __U, __m512i __A, __m512i __B) { + return (__mmask32)__builtin_ia32_ptestmw512((__v32hi)__A, (__v32hi)__B, __U); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_testn_epi8_mask(__m512i __A, __m512i __B) { + return 
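
/*
 * Editorial sketch (not part of the patch): movepi8_mask and movm_epi8
 * convert between sign-bit masks and 0x00/0xFF byte vectors, replacing the
 * SSE2 pmovmskb idiom at 512-bit width:
 *
 *   __mmask64 m = _mm512_movepi8_mask(v);   // one bit per byte sign
 *   __m512i full = _mm512_movm_epi8(m);     // back to 0x00/0xFF bytes
 */
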
(__mmask64)__builtin_ia32_ptestnmb512((__v64qi)__A, (__v64qi)__B, + (__mmask64)-1); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_testn_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B) { + return (__mmask64)__builtin_ia32_ptestnmb512((__v64qi)__A, (__v64qi)__B, __U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_testn_epi16_mask(__m512i __A, __m512i __B) { + return (__mmask32)__builtin_ia32_ptestnmw512((__v32hi)__A, (__v32hi)__B, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_testn_epi16_mask(__mmask32 __U, __m512i __A, __m512i __B) { + return (__mmask32)__builtin_ia32_ptestnmw512((__v32hi)__A, (__v32hi)__B, __U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shuffle_epi8(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pshufb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pshufb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)__W, (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pshufb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_min_epu16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pminuw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_min_epu16(__mmask32 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pminuw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__M); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_min_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pminuw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_min_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pminsw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_min_epi16(__mmask32 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pminsw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__M); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_min_epi16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pminsw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_max_epu8(__m512i 
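
/*
 * Editorial sketch (not part of the patch): _mm512_shuffle_epi8 is pshufb
 * per 128-bit lane; each selector byte picks a byte from its own lane, or
 * zeroes it when bit 7 is set, which gives 16-entry table lookups such as
 * hex-encoding nibbles already reduced to 0..15:
 *
 *   __m128i t = _mm_loadu_si128((const __m128i *)"0123456789abcdef");
 *   __m512i lut = _mm512_broadcast_i32x4(t);   // AVX-512F broadcast
 *   __m512i hex = _mm512_shuffle_epi8(lut, nibbles);
 */
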
__A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaxub512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_max_epu8(__mmask64 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaxub512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_max_epu8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaxub512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)__W, (__mmask64)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_max_epi8(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaxsb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_max_epi8(__mmask64 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaxsb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_max_epi8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaxsb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)__W, (__mmask64)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_min_epu8(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pminub512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_min_epu8(__mmask64 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pminub512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_min_epu8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pminub512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)__W, (__mmask64)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_min_epi8(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pminsb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_min_epi8(__mmask64 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pminsb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)_mm512_setzero_si512(), + (__mmask64)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_min_epi8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pminsb512_mask((__v64qi)__A, (__v64qi)__B, + (__v64qi)__W, (__mmask64)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_max_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaxsw512_mask((__v32hi)__A, (__v32hi)__B, + 
(__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_max_epi16(__mmask32 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaxsw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__M); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_max_epi16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaxsw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_max_epu16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaxuw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_max_epu16(__mmask32 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaxuw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__M); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_max_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaxuw512_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sra_epi16(__m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psraw512_mask((__v32hi)__A, (__v8hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psraw512_mask((__v32hi)__A, (__v8hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psraw512_mask((__v32hi)__A, (__v8hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_srav_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psrav32hi_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psrav32hi_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psrav32hi_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_srlv_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psrlv32hi_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, 
__always_inline__, + __artificial__)) +_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psrlv32hi_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psrlv32hi_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sllv_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psllv32hi_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_sllv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psllv32hi_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psllv32hi_mask((__v32hi)__A, (__v32hi)__B, + (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_packsswb512_mask((__v32hi)__A, (__v32hi)__B, + (__v64qi)__W, (__mmask64)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_packsswb512_mask( + (__v32hi)__A, (__v32hi)__B, (__v64qi)_mm512_setzero_si512(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_packus_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_packuswb512_mask( + (__v32hi)__A, (__v32hi)__B, (__v64qi)_mm512_setzero_si512(), + (__mmask64)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_packuswb512_mask((__v32hi)__A, (__v32hi)__B, + (__v64qi)__W, (__mmask64)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_packuswb512_mask( + (__v32hi)__A, (__v32hi)__B, (__v64qi)_mm512_setzero_si512(), + (__mmask64)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_abs_epi8(__m512i __A) { + return (__m512i)__builtin_ia32_pabsb512_mask( + (__v64qi)__A, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_abs_epi8(__m512i __W, __mmask64 __U, __m512i __A) { + return (__m512i)__builtin_ia32_pabsb512_mask((__v64qi)__A, (__v64qi)__W, + (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_abs_epi8(__mmask64 __U, __m512i __A) { + return (__m512i)__builtin_ia32_pabsb512_mask( + 
(__v64qi)__A, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_abs_epi16(__m512i __A) { + return (__m512i)__builtin_ia32_pabsw512_mask( + (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_abs_epi16(__m512i __W, __mmask32 __U, __m512i __A) { + return (__m512i)__builtin_ia32_pabsw512_mask((__v32hi)__A, (__v32hi)__W, + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_abs_epi16(__mmask32 __U, __m512i __A) { + return (__m512i)__builtin_ia32_pabsw512_mask( + (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpneq_epu8_mask(__mmask64 __M, __m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 4, + (__mmask64)__M); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmplt_epu8_mask(__mmask64 __M, __m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 1, + (__mmask64)__M); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpge_epu8_mask(__mmask64 __M, __m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 5, + (__mmask64)__M); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmple_epu8_mask(__mmask64 __M, __m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 2, + (__mmask64)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpneq_epu16_mask(__mmask32 __M, __m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 4, + (__mmask32)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmplt_epu16_mask(__mmask32 __M, __m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 1, + (__mmask32)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpge_epu16_mask(__mmask32 __M, __m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 5, + (__mmask32)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmple_epu16_mask(__mmask32 __M, __m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 2, + (__mmask32)__M); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpneq_epi8_mask(__mmask64 __M, __m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 4, + (__mmask64)__M); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmplt_epi8_mask(__mmask64 __M, __m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 1, 
+ (__mmask64)__M); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpge_epi8_mask(__mmask64 __M, __m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 5, + (__mmask64)__M); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmple_epi8_mask(__mmask64 __M, __m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 2, + (__mmask64)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpneq_epi16_mask(__mmask32 __M, __m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 4, + (__mmask32)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmplt_epi16_mask(__mmask32 __M, __m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 1, + (__mmask32)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpge_epi16_mask(__mmask32 __M, __m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 5, + (__mmask32)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmple_epi16_mask(__mmask32 __M, __m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 2, + (__mmask32)__M); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpneq_epu8_mask(__m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 4, + (__mmask64)-1); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmplt_epu8_mask(__m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 1, + (__mmask64)-1); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpge_epu8_mask(__m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 5, + (__mmask64)-1); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmple_epu8_mask(__m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 2, + (__mmask64)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpneq_epu16_mask(__m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 4, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmplt_epu16_mask(__m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 1, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpge_epu16_mask(__m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 5, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm512_cmple_epu16_mask(__m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 2, + (__mmask32)-1); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpneq_epi8_mask(__m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 4, + (__mmask64)-1); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmplt_epi8_mask(__m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 1, + (__mmask64)-1); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpge_epi8_mask(__m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 5, + (__mmask64)-1); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmple_epi8_mask(__m512i __X, __m512i __Y) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 2, + (__mmask64)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpneq_epi16_mask(__m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 4, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmplt_epi16_mask(__m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 1, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpge_epi16_mask(__m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 5, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmple_epi16_mask(__m512i __X, __m512i __Y) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 2, + (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_packs_epi32(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_packssdw512_mask( + (__v16si)__A, (__v16si)__B, (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_packssdw512_mask( + (__v16si)__A, (__v16si)__B, (__v32hi)_mm512_setzero_si512(), __M); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_packssdw512_mask((__v16si)__A, (__v16si)__B, + (__v32hi)__W, __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_packus_epi32(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_packusdw512_mask( + (__v16si)__A, (__v16si)__B, (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_packusdw512_mask( + (__v16si)__A, (__v16si)__B, 
(__v32hi)_mm512_setzero_si512(), __M); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_packusdw512_mask((__v16si)__A, (__v16si)__B, + (__v32hi)__W, __M); +} + +#ifdef __OPTIMIZE__ +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kshiftli_mask32(__mmask32 __A, unsigned int __B) { + return (__mmask32)__builtin_ia32_kshiftlisi((__mmask32)__A, (__mmask8)__B); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kshiftli_mask64(__mmask64 __A, unsigned int __B) { + return (__mmask64)__builtin_ia32_kshiftlidi((__mmask64)__A, (__mmask8)__B); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kshiftri_mask32(__mmask32 __A, unsigned int __B) { + return (__mmask32)__builtin_ia32_kshiftrisi((__mmask32)__A, (__mmask8)__B); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kshiftri_mask64(__mmask64 __A, unsigned int __B) { + return (__mmask64)__builtin_ia32_kshiftridi((__mmask64)__A, (__mmask8)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_alignr_epi8(__m512i __A, __m512i __B, const int __N) { + return (__m512i)__builtin_ia32_palignr512((__v8di)__A, (__v8di)__B, __N * 8); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_alignr_epi8(__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B, const int __N) { + return (__m512i)__builtin_ia32_palignr512_mask( + (__v8di)__A, (__v8di)__B, __N * 8, (__v8di)__W, (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_alignr_epi8(__mmask64 __U, __m512i __A, __m512i __B, + const int __N) { + return (__m512i)__builtin_ia32_palignr512_mask( + (__v8di)__A, (__v8di)__B, __N * 8, (__v8di)_mm512_setzero_si512(), + (__mmask64)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_dbsad_epu8(__m512i __A, __m512i __B, const int __imm) { + return (__m512i)__builtin_ia32_dbpsadbw512_mask( + (__v64qi)__A, (__v64qi)__B, __imm, (__v32hi)_mm512_setzero_si512(), + (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_dbsad_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B, + const int __imm) { + return (__m512i)__builtin_ia32_dbpsadbw512_mask( + (__v64qi)__A, (__v64qi)__B, __imm, (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_dbsad_epu8(__mmask32 __U, __m512i __A, __m512i __B, + const int __imm) { + return (__m512i)__builtin_ia32_dbpsadbw512_mask( + (__v64qi)__A, (__v64qi)__B, __imm, (__v32hi)_mm512_setzero_si512(), + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_srli_epi16(__m512i __A, const int __imm) { + return (__m512i)__builtin_ia32_psrlwi512_mask( + (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, + const int 
__imm) { + return (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)__A, __imm, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, const int __imm) { + return (__m512i)__builtin_ia32_psrlwi512_mask( + (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_slli_epi16(__m512i __A, const int __B) { + return (__m512i)__builtin_ia32_psllwi512_mask( + (__v32hi)__A, __B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, const int __B) { + return (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)__A, __B, (__v32hi)__W, + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, const int __B) { + return (__m512i)__builtin_ia32_psllwi512_mask( + (__v32hi)__A, __B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shufflehi_epi16(__m512i __A, const int __imm) { + return (__m512i)__builtin_ia32_pshufhw512_mask( + (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_shufflehi_epi16(__m512i __W, __mmask32 __U, __m512i __A, + const int __imm) { + return (__m512i)__builtin_ia32_pshufhw512_mask((__v32hi)__A, __imm, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_shufflehi_epi16(__mmask32 __U, __m512i __A, const int __imm) { + return (__m512i)__builtin_ia32_pshufhw512_mask( + (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shufflelo_epi16(__m512i __A, const int __imm) { + return (__m512i)__builtin_ia32_pshuflw512_mask( + (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_shufflelo_epi16(__m512i __W, __mmask32 __U, __m512i __A, + const int __imm) { + return (__m512i)__builtin_ia32_pshuflw512_mask((__v32hi)__A, __imm, + (__v32hi)__W, (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_shufflelo_epi16(__mmask32 __U, __m512i __A, const int __imm) { + return (__m512i)__builtin_ia32_pshuflw512_mask( + (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_srai_epi16(__m512i __A, const int __imm) { + return (__m512i)__builtin_ia32_psrawi512_mask( + (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, + const int __imm) { + return (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)__A, __imm, + (__v32hi)__W, (__mmask32)__U); +} + 
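/* A sketch of how the triple defined above composes (illustrative,
   outside the patch): every AVX512BW op comes as a plain form, a
   _mask_ form that merges into __W wherever a mask bit is clear, and
   a _maskz_ form that zeroes those lanes instead. Assumes an
   AVX-512BW target; srai_demo is a hypothetical name. */
__m512i srai_demo(__m512i w, __m512i a) {
  __mmask32 u = 0x0000FFFF;  /* select the low 16 of 32 word lanes */
  __m512i merged = _mm512_mask_srai_epi16(w, u, a, 3);  /* clear lanes keep w */
  __m512i zeroed = _mm512_maskz_srai_epi16(u, a, 3);    /* clear lanes become 0 */
  return _mm512_add_epi16(merged, zeroed);              /* plain form: no mask */
}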
+extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, const int __imm) { + return (__m512i)__builtin_ia32_psrawi512_mask( + (__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_blend_epi16(__mmask32 __U, __m512i __A, __m512i __W) { + return (__m512i)__builtin_ia32_blendmw_512_mask((__v32hi)__A, (__v32hi)__W, + (__mmask32)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_blend_epi8(__mmask64 __U, __m512i __A, __m512i __W) { + return (__m512i)__builtin_ia32_blendmb_512_mask((__v64qi)__A, (__v64qi)__W, + (__mmask64)__U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmp_epi16_mask(__mmask32 __U, __m512i __X, __m512i __Y, + const int __P) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, __P, + (__mmask32)__U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmp_epi16_mask(__m512i __X, __m512i __Y, const int __P) { + return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, __P, + (__mmask32)-1); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmp_epi8_mask(__mmask64 __U, __m512i __X, __m512i __Y, + const int __P) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, __P, + (__mmask64)__U); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmp_epi8_mask(__m512i __X, __m512i __Y, const int __P) { + return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, __P, + (__mmask64)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmp_epu16_mask(__mmask32 __U, __m512i __X, __m512i __Y, + const int __P) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, + __P, (__mmask32)__U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmp_epu16_mask(__m512i __X, __m512i __Y, const int __P) { + return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, + __P, (__mmask32)-1); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmp_epu8_mask(__mmask64 __U, __m512i __X, __m512i __Y, + const int __P) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, + __P, (__mmask64)__U); +} + +extern __inline __mmask64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmp_epu8_mask(__m512i __X, __m512i __Y, const int __P) { + return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, + __P, (__mmask64)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_bslli_epi128(__m512i __A, const int __N) { + return (__m512i)__builtin_ia32_pslldq512(__A, __N * 8); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_bsrli_epi128(__m512i __A, const int __N) { + return (__m512i)__builtin_ia32_psrldq512(__A, __N * 8); +} + +#else +#define _kshiftli_mask32(X, Y) \ + 
((__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(X), (__mmask8)(Y))) + +#define _kshiftli_mask64(X, Y) \ + ((__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(X), (__mmask8)(Y))) + +#define _kshiftri_mask32(X, Y) \ + ((__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(X), (__mmask8)(Y))) + +#define _kshiftri_mask64(X, Y) \ + ((__mmask64)__builtin_ia32_kshiftridi((__mmask64)(X), (__mmask8)(Y))) + +#define _mm512_alignr_epi8(X, Y, N) \ + ((__m512i)__builtin_ia32_palignr512((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(N * 8))) + +#define _mm512_mask_alignr_epi8(W, U, X, Y, N) \ + ((__m512i)__builtin_ia32_palignr512_mask( \ + (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(N * 8), \ + (__v8di)(__m512i)(W), (__mmask64)(U))) + +#define _mm512_maskz_alignr_epi8(U, X, Y, N) \ + ((__m512i)__builtin_ia32_palignr512_mask( \ + (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(N * 8), \ + (__v8di)(__m512i)_mm512_setzero_si512(), (__mmask64)(U))) + +#define _mm512_dbsad_epu8(X, Y, C) \ + ((__m512i)__builtin_ia32_dbpsadbw512_mask( \ + (__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(C), \ + (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)-1)) + +#define _mm512_mask_dbsad_epu8(W, U, X, Y, C) \ + ((__m512i)__builtin_ia32_dbpsadbw512_mask( \ + (__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(C), \ + (__v32hi)(__m512i)(W), (__mmask32)(U))) + +#define _mm512_maskz_dbsad_epu8(U, X, Y, C) \ + ((__m512i)__builtin_ia32_dbpsadbw512_mask( \ + (__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(C), \ + (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)(U))) + +#define _mm512_srli_epi16(A, B) \ + ((__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(B), \ + (__v32hi)_mm512_setzero_si512(), \ + (__mmask32)-1)) + +#define _mm512_mask_srli_epi16(W, U, A, B) \ + ((__m512i)__builtin_ia32_psrlwi512_mask( \ + (__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) + +#define _mm512_maskz_srli_epi16(U, A, B) \ + ((__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)(__m512i)(A), (int)(B), \ + (__v32hi)_mm512_setzero_si512(), \ + (__mmask32)(U))) + +#define _mm512_slli_epi16(X, C) \ + ((__m512i)__builtin_ia32_psllwi512_mask( \ + (__v32hi)(__m512i)(X), (int)(C), \ + (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)-1)) + +#define _mm512_mask_slli_epi16(W, U, X, C) \ + ((__m512i)__builtin_ia32_psllwi512_mask( \ + (__v32hi)(__m512i)(X), (int)(C), (__v32hi)(__m512i)(W), (__mmask32)(U))) + +#define _mm512_maskz_slli_epi16(U, X, C) \ + ((__m512i)__builtin_ia32_psllwi512_mask( \ + (__v32hi)(__m512i)(X), (int)(C), \ + (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)(U))) + +#define _mm512_shufflehi_epi16(A, B) \ + ((__m512i)__builtin_ia32_pshufhw512_mask( \ + (__v32hi)(__m512i)(A), (int)(B), \ + (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)-1)) + +#define _mm512_mask_shufflehi_epi16(W, U, A, B) \ + ((__m512i)__builtin_ia32_pshufhw512_mask( \ + (__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) + +#define _mm512_maskz_shufflehi_epi16(U, A, B) \ + ((__m512i)__builtin_ia32_pshufhw512_mask( \ + (__v32hi)(__m512i)(A), (int)(B), \ + (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)(U))) + +#define _mm512_shufflelo_epi16(A, B) \ + ((__m512i)__builtin_ia32_pshuflw512_mask( \ + (__v32hi)(__m512i)(A), (int)(B), \ + (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)-1)) + +#define _mm512_mask_shufflelo_epi16(W, U, A, B) \ + ((__m512i)__builtin_ia32_pshuflw512_mask( \ + (__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) + +#define 
_mm512_maskz_shufflelo_epi16(U, A, B) \ + ((__m512i)__builtin_ia32_pshuflw512_mask( \ + (__v32hi)(__m512i)(A), (int)(B), \ + (__v32hi)(__m512i)_mm512_setzero_si512(), (__mmask32)(U))) + +#define _mm512_srai_epi16(A, B) \ + ((__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \ + (__v32hi)_mm512_setzero_si512(), \ + (__mmask32)-1)) + +#define _mm512_mask_srai_epi16(W, U, A, B) \ + ((__m512i)__builtin_ia32_psrawi512_mask( \ + (__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) + +#define _mm512_maskz_srai_epi16(U, A, B) \ + ((__m512i)__builtin_ia32_psrawi512_mask((__v32hi)(__m512i)(A), (int)(B), \ + (__v32hi)_mm512_setzero_si512(), \ + (__mmask32)(U))) + +#define _mm512_mask_blend_epi16(__U, __A, __W) \ + ((__m512i)__builtin_ia32_blendmw_512_mask((__v32hi)(__A), (__v32hi)(__W), \ + (__mmask32)(__U))) + +#define _mm512_mask_blend_epi8(__U, __A, __W) \ + ((__m512i)__builtin_ia32_blendmb_512_mask((__v64qi)(__A), (__v64qi)(__W), \ + (__mmask64)(__U))) + +#define _mm512_cmp_epi16_mask(X, Y, P) \ + ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(X), \ + (__v32hi)(__m512i)(Y), (int)(P), \ + (__mmask32)(-1))) + +#define _mm512_cmp_epi8_mask(X, Y, P) \ + ((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(X), \ + (__v64qi)(__m512i)(Y), (int)(P), \ + (__mmask64)(-1))) + +#define _mm512_cmp_epu16_mask(X, Y, P) \ + ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(X), \ + (__v32hi)(__m512i)(Y), (int)(P), \ + (__mmask32)(-1))) + +#define _mm512_cmp_epu8_mask(X, Y, P) \ + ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(X), \ + (__v64qi)(__m512i)(Y), (int)(P), \ + (__mmask64)(-1))) + +#define _mm512_mask_cmp_epi16_mask(M, X, Y, P) \ + ((__mmask32)__builtin_ia32_cmpw512_mask( \ + (__v32hi)(__m512i)(X), (__v32hi)(__m512i)(Y), (int)(P), (__mmask32)(M))) + +#define _mm512_mask_cmp_epi8_mask(M, X, Y, P) \ + ((__mmask64)__builtin_ia32_cmpb512_mask( \ + (__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(P), (__mmask64)(M))) + +#define _mm512_mask_cmp_epu16_mask(M, X, Y, P) \ + ((__mmask32)__builtin_ia32_ucmpw512_mask( \ + (__v32hi)(__m512i)(X), (__v32hi)(__m512i)(Y), (int)(P), (__mmask32)(M))) + +#define _mm512_mask_cmp_epu8_mask(M, X, Y, P) \ + ((__mmask64)__builtin_ia32_ucmpb512_mask( \ + (__v64qi)(__m512i)(X), (__v64qi)(__m512i)(Y), (int)(P), (__mmask64)(M))) + +#define _mm512_bslli_epi128(A, N) \ + ((__m512i)__builtin_ia32_pslldq512((__m512i)(A), (int)(N)*8)) + +#define _mm512_bsrli_epi128(A, N) \ + ((__m512i)__builtin_ia32_psrldq512((__m512i)(A), (int)(N)*8)) + +#endif + +#ifdef __DISABLE_AVX512BW__ +#undef __DISABLE_AVX512BW__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BW__ */ + +#endif /* _AVX512BWINTRIN_H_INCLUDED */ diff --git a/third_party/intel/avx512cdintrin.internal.h b/third_party/intel/avx512cdintrin.internal.h new file mode 100644 index 000000000..685cf2c46 --- /dev/null +++ b/third_party/intel/avx512cdintrin.internal.h @@ -0,0 +1,124 @@ +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
+#endif + +#ifndef _AVX512CDINTRIN_H_INCLUDED +#define _AVX512CDINTRIN_H_INCLUDED + +#ifndef __AVX512CD__ +#pragma GCC push_options +#pragma GCC target("avx512cd") +#define __DISABLE_AVX512CD__ +#endif /* __AVX512CD__ */ + +typedef long long __v8di __attribute__((__vector_size__(64))); +typedef int __v16si __attribute__((__vector_size__(64))); + +typedef long long __m512i __attribute__((__vector_size__(64), __may_alias__)); +typedef double __m512d __attribute__((__vector_size__(64), __may_alias__)); + +typedef unsigned char __mmask8; +typedef unsigned short __mmask16; + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_conflict_epi32(__m512i __A) { + return (__m512i)__builtin_ia32_vpconflictsi_512_mask( + (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_conflict_epi32(__m512i __W, __mmask16 __U, __m512i __A) { + return (__m512i)__builtin_ia32_vpconflictsi_512_mask( + (__v16si)__A, (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_conflict_epi32(__mmask16 __U, __m512i __A) { + return (__m512i)__builtin_ia32_vpconflictsi_512_mask( + (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_conflict_epi64(__m512i __A) { + return (__m512i)__builtin_ia32_vpconflictdi_512_mask( + (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_conflict_epi64(__m512i __W, __mmask8 __U, __m512i __A) { + return (__m512i)__builtin_ia32_vpconflictdi_512_mask((__v8di)__A, (__v8di)__W, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_conflict_epi64(__mmask8 __U, __m512i __A) { + return (__m512i)__builtin_ia32_vpconflictdi_512_mask( + (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_lzcnt_epi64(__m512i __A) { + return (__m512i)__builtin_ia32_vplzcntq_512_mask( + (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_lzcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) { + return (__m512i)__builtin_ia32_vplzcntq_512_mask((__v8di)__A, (__v8di)__W, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_lzcnt_epi64(__mmask8 __U, __m512i __A) { + return (__m512i)__builtin_ia32_vplzcntq_512_mask( + (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_lzcnt_epi32(__m512i __A) { + return (__m512i)__builtin_ia32_vplzcntd_512_mask( + (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_lzcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) { + return (__m512i)__builtin_ia32_vplzcntd_512_mask((__v16si)__A, (__v16si)__W, + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm512_maskz_lzcnt_epi32(__mmask16 __U, __m512i __A) { + return (__m512i)__builtin_ia32_vplzcntd_512_mask( + (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcastmb_epi64(__mmask8 __A) { + return (__m512i)__builtin_ia32_broadcastmb512(__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcastmw_epi32(__mmask16 __A) { + return (__m512i)__builtin_ia32_broadcastmw512(__A); +} + +#ifdef __DISABLE_AVX512CD__ +#undef __DISABLE_AVX512CD__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512CD__ */ + +#endif /* _AVX512CDINTRIN_H_INCLUDED */ diff --git a/third_party/intel/avx512dqintrin.internal.h b/third_party/intel/avx512dqintrin.internal.h new file mode 100644 index 000000000..fcd0ed97d --- /dev/null +++ b/third_party/intel/avx512dqintrin.internal.h @@ -0,0 +1,2006 @@ +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512dqintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVX512DQINTRIN_H_INCLUDED +#define _AVX512DQINTRIN_H_INCLUDED + +#ifndef __AVX512DQ__ +#pragma GCC push_options +#pragma GCC target("avx512dq") +#define __DISABLE_AVX512DQ__ +#endif /* __AVX512DQ__ */ + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _ktest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__CF) { + *__CF = (unsigned char)__builtin_ia32_ktestcqi(__A, __B); + return (unsigned char)__builtin_ia32_ktestzqi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _ktestz_mask8_u8(__mmask8 __A, __mmask8 __B) { + return (unsigned char)__builtin_ia32_ktestzqi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _ktestc_mask8_u8(__mmask8 __A, __mmask8 __B) { + return (unsigned char)__builtin_ia32_ktestcqi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _ktest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__CF) { + *__CF = (unsigned char)__builtin_ia32_ktestchi(__A, __B); + return (unsigned char)__builtin_ia32_ktestzhi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _ktestz_mask16_u8(__mmask16 __A, __mmask16 __B) { + return (unsigned char)__builtin_ia32_ktestzhi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _ktestc_mask16_u8(__mmask16 __A, __mmask16 __B) { + return (unsigned char)__builtin_ia32_ktestchi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kortest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__CF) { + *__CF = (unsigned char)__builtin_ia32_kortestcqi(__A, __B); + return (unsigned char)__builtin_ia32_kortestzqi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kortestz_mask8_u8(__mmask8 __A, __mmask8 __B) { + return (unsigned char)__builtin_ia32_kortestzqi(__A, __B); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kortestc_mask8_u8(__mmask8 __A, __mmask8 __B) { + return (unsigned char)__builtin_ia32_kortestcqi(__A, __B); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__,
__always_inline__, __artificial__)) + _kadd_mask8(__mmask8 __A, __mmask8 __B) { + return (__mmask8)__builtin_ia32_kaddqi((__mmask8)__A, (__mmask8)__B); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kadd_mask16(__mmask16 __A, __mmask16 __B) { + return (__mmask16)__builtin_ia32_kaddhi((__mmask16)__A, (__mmask16)__B); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _cvtmask8_u32(__mmask8 __A) { + return (unsigned int)__builtin_ia32_kmovb((__mmask8)__A); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _cvtu32_mask8(unsigned int __A) { + return (__mmask8)__builtin_ia32_kmovb((__mmask8)__A); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _load_mask8(__mmask8 *__A) { + return (__mmask8)__builtin_ia32_kmovb(*(__mmask8 *)__A); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _store_mask8(__mmask8 *__A, __mmask8 __B) { + *(__mmask8 *)__A = __builtin_ia32_kmovb(__B); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _knot_mask8(__mmask8 __A) { + return (__mmask8)__builtin_ia32_knotqi((__mmask8)__A); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kor_mask8(__mmask8 __A, __mmask8 __B) { + return (__mmask8)__builtin_ia32_korqi((__mmask8)__A, (__mmask8)__B); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kxnor_mask8(__mmask8 __A, __mmask8 __B) { + return (__mmask8)__builtin_ia32_kxnorqi((__mmask8)__A, (__mmask8)__B); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kxor_mask8(__mmask8 __A, __mmask8 __B) { + return (__mmask8)__builtin_ia32_kxorqi((__mmask8)__A, (__mmask8)__B); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kand_mask8(__mmask8 __A, __mmask8 __B) { + return (__mmask8)__builtin_ia32_kandqi((__mmask8)__A, (__mmask8)__B); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kandn_mask8(__mmask8 __A, __mmask8 __B) { + return (__mmask8)__builtin_ia32_kandnqi((__mmask8)__A, (__mmask8)__B); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcast_f64x2(__m128d __A) { + return (__m512d)__builtin_ia32_broadcastf64x2_512_mask( + (__v2df)__A, _mm512_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) { + return (__m512d)__builtin_ia32_broadcastf64x2_512_mask((__v2df)__A, + (__v8df)__O, __M); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) { + return (__m512d)__builtin_ia32_broadcastf64x2_512_mask( + (__v2df)__A, (__v8df)_mm512_setzero_ps(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcast_i64x2(__m128i __A) { + return (__m512i)__builtin_ia32_broadcasti64x2_512_mask( + (__v2di)__A, _mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) { + return (__m512i)__builtin_ia32_broadcasti64x2_512_mask((__v2di)__A, + (__v8di)__O, __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { + return (__m512i)__builtin_ia32_broadcasti64x2_512_mask( + (__v2di)__A, (__v8di)_mm512_setzero_si512(), __M); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcast_f32x2(__m128 __A) { + return (__m512)__builtin_ia32_broadcastf32x2_512_mask( + (__v4sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_broadcast_f32x2(__m512 __O, __mmask16 __M, __m128 __A) { + return (__m512)__builtin_ia32_broadcastf32x2_512_mask((__v4sf)__A, + (__v16sf)__O, __M); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcast_f32x2(__mmask16 __M, __m128 __A) { + return (__m512)__builtin_ia32_broadcastf32x2_512_mask( + (__v4sf)__A, (__v16sf)_mm512_setzero_ps(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcast_i32x2(__m128i __A) { + return (__m512i)__builtin_ia32_broadcasti32x2_512_mask( + (__v4si)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M, __m128i __A) { + return (__m512i)__builtin_ia32_broadcasti32x2_512_mask((__v4si)__A, + (__v16si)__O, __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) { + return (__m512i)__builtin_ia32_broadcasti32x2_512_mask( + (__v4si)__A, (__v16si)_mm512_setzero_si512(), __M); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcast_f32x8(__m256 __A) { + return (__m512)__builtin_ia32_broadcastf32x8_512_mask( + (__v8sf)__A, _mm512_undefined_ps(), (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) { + return (__m512)__builtin_ia32_broadcastf32x8_512_mask((__v8sf)__A, + (__v16sf)__O, __M); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) { + return (__m512)__builtin_ia32_broadcastf32x8_512_mask( + (__v8sf)__A, (__v16sf)_mm512_setzero_ps(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcast_i32x8(__m256i __A) { + return (__m512i)__builtin_ia32_broadcasti32x8_512_mask( + (__v8si)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) { + return (__m512i)__builtin_ia32_broadcasti32x8_512_mask((__v8si)__A, + (__v16si)__O, __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) { + return (__m512i)__builtin_ia32_broadcasti32x8_512_mask( + (__v8si)__A, 
(__v16si)_mm512_setzero_si512(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mullo_epi64(__m512i __A, __m512i __B) { + return (__m512i)((__v8du)__A * (__v8du)__B); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmullq512_mask((__v8di)__A, (__v8di)__B, + (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmullq512_mask( + (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_xor_pd(__m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_xorpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_xorpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__W, (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_xorpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_xor_ps(__m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_xorps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_xorps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__W, (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_xorps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_or_pd(__m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_orpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_orpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__W, (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_orpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_or_ps(__m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_orps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1); +} + 
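/* A sketch of what these DQ logical ops buy (illustrative, outside
   the patch): bitwise logic directly on double/float vectors, so the
   usual sign-bit tricks need no detour through integer vectors.
   negate8/abs8 are hypothetical names; assumes an AVX-512DQ target;
   _mm512_andnot_pd is defined just below. */
__m512d negate8(__m512d x) {
  return _mm512_xor_pd(x, _mm512_set1_pd(-0.0)); /* flip only the sign bit */
}
__m512d abs8(__m512d x) {
  /* andnot computes ~a & b, so this clears each lane's sign bit */
  return _mm512_andnot_pd(_mm512_set1_pd(-0.0), x);
}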
+extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_orps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__W, (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_orps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_and_pd(__m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_andpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_andpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__W, (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_andpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_and_ps(__m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_andps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_andps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__W, (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_andps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_andnot_pd(__m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_andnpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_andnpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__W, (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_andnpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_andnot_ps(__m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_andnps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_andnps512_mask((__v16sf)__A, 
(__v16sf)__B, + (__v16sf)__W, (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_andnps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_movepi32_mask(__m512i __A) { + return (__mmask16)__builtin_ia32_cvtd2mask512((__v16si)__A); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_movepi64_mask(__m512i __A) { + return (__mmask8)__builtin_ia32_cvtq2mask512((__v8di)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_movm_epi32(__mmask16 __A) { + return (__m512i)__builtin_ia32_cvtmask2d512(__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_movm_epi64(__mmask8 __A) { + return (__m512i)__builtin_ia32_cvtmask2q512(__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvttpd_epi64(__m512d __A) { + return (__m512i)__builtin_ia32_cvttpd2qq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvttpd_epi64(__m512i __W, __mmask8 __U, __m512d __A) { + return (__m512i)__builtin_ia32_cvttpd2qq512_mask( + (__v8df)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvttpd_epi64(__mmask8 __U, __m512d __A) { + return (__m512i)__builtin_ia32_cvttpd2qq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvttpd_epu64(__m512d __A) { + return (__m512i)__builtin_ia32_cvttpd2uqq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvttpd_epu64(__m512i __W, __mmask8 __U, __m512d __A) { + return (__m512i)__builtin_ia32_cvttpd2uqq512_mask( + (__v8df)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvttpd_epu64(__mmask8 __U, __m512d __A) { + return (__m512i)__builtin_ia32_cvttpd2uqq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvttps_epi64(__m256 __A) { + return (__m512i)__builtin_ia32_cvttps2qq512_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvttps_epi64(__m512i __W, __mmask8 __U, __m256 __A) { + return (__m512i)__builtin_ia32_cvttps2qq512_mask( + (__v8sf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvttps_epi64(__mmask8 
__U, __m256 __A) { + return (__m512i)__builtin_ia32_cvttps2qq512_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvttps_epu64(__m256 __A) { + return (__m512i)__builtin_ia32_cvttps2uqq512_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvttps_epu64(__m512i __W, __mmask8 __U, __m256 __A) { + return (__m512i)__builtin_ia32_cvttps2uqq512_mask( + (__v8sf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvttps_epu64(__mmask8 __U, __m256 __A) { + return (__m512i)__builtin_ia32_cvttps2uqq512_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtpd_epi64(__m512d __A) { + return (__m512i)__builtin_ia32_cvtpd2qq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtpd_epi64(__m512i __W, __mmask8 __U, __m512d __A) { + return (__m512i)__builtin_ia32_cvtpd2qq512_mask( + (__v8df)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtpd_epi64(__mmask8 __U, __m512d __A) { + return (__m512i)__builtin_ia32_cvtpd2qq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtpd_epu64(__m512d __A) { + return (__m512i)__builtin_ia32_cvtpd2uqq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtpd_epu64(__m512i __W, __mmask8 __U, __m512d __A) { + return (__m512i)__builtin_ia32_cvtpd2uqq512_mask( + (__v8df)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtpd_epu64(__mmask8 __U, __m512d __A) { + return (__m512i)__builtin_ia32_cvtpd2uqq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtps_epi64(__m256 __A) { + return (__m512i)__builtin_ia32_cvtps2qq512_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtps_epi64(__m512i __W, __mmask8 __U, __m256 __A) { + return (__m512i)__builtin_ia32_cvtps2qq512_mask( + (__v8sf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtps_epi64(__mmask8 __U, __m256 __A) { + return (__m512i)__builtin_ia32_cvtps2qq512_mask( + (__v8sf)__A, 
(__v8di)_mm512_setzero_si512(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtps_epu64(__m256 __A) { + return (__m512i)__builtin_ia32_cvtps2uqq512_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtps_epu64(__m512i __W, __mmask8 __U, __m256 __A) { + return (__m512i)__builtin_ia32_cvtps2uqq512_mask( + (__v8sf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtps_epu64(__mmask8 __U, __m256 __A) { + return (__m512i)__builtin_ia32_cvtps2uqq512_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi64_ps(__m512i __A) { + return (__m256)__builtin_ia32_cvtqq2ps512_mask( + (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi64_ps(__m256 __W, __mmask8 __U, __m512i __A) { + return (__m256)__builtin_ia32_cvtqq2ps512_mask( + (__v8di)__A, (__v8sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi64_ps(__mmask8 __U, __m512i __A) { + return (__m256)__builtin_ia32_cvtqq2ps512_mask( + (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepu64_ps(__m512i __A) { + return (__m256)__builtin_ia32_cvtuqq2ps512_mask( + (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepu64_ps(__m256 __W, __mmask8 __U, __m512i __A) { + return (__m256)__builtin_ia32_cvtuqq2ps512_mask( + (__v8di)__A, (__v8sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepu64_ps(__mmask8 __U, __m512i __A) { + return (__m256)__builtin_ia32_cvtuqq2ps512_mask( + (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi64_pd(__m512i __A) { + return (__m512d)__builtin_ia32_cvtqq2pd512_mask( + (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi64_pd(__m512d __W, __mmask8 __U, __m512i __A) { + return (__m512d)__builtin_ia32_cvtqq2pd512_mask( + (__v8di)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi64_pd(__mmask8 __U, __m512i __A) { + return (__m512d)__builtin_ia32_cvtqq2pd512_mask( + (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm512_cvtepu64_pd(__m512i __A) { + return (__m512d)__builtin_ia32_cvtuqq2pd512_mask( + (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepu64_pd(__m512d __W, __mmask8 __U, __m512i __A) { + return (__m512d)__builtin_ia32_cvtuqq2pd512_mask( + (__v8di)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepu64_pd(__mmask8 __U, __m512i __A) { + return (__m512d)__builtin_ia32_cvtuqq2pd512_mask( + (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kshiftli_mask8(__mmask8 __A, unsigned int __B) { + return (__mmask8)__builtin_ia32_kshiftliqi((__mmask8)__A, (__mmask8)__B); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _kshiftri_mask8(__mmask8 __A, unsigned int __B) { + return (__mmask8)__builtin_ia32_kshiftriqi((__mmask8)__A, (__mmask8)__B); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_range_pd(__m512d __A, __m512d __B, int __C) { + return (__m512d)__builtin_ia32_rangepd512_mask( + (__v8df)__A, (__v8df)__B, __C, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_range_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B, + int __C) { + return (__m512d)__builtin_ia32_rangepd512_mask((__v8df)__A, (__v8df)__B, __C, + (__v8df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_range_pd(__mmask8 __U, __m512d __A, __m512d __B, int __C) { + return (__m512d)__builtin_ia32_rangepd512_mask( + (__v8df)__A, (__v8df)__B, __C, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_range_ps(__m512 __A, __m512 __B, int __C) { + return (__m512)__builtin_ia32_rangeps512_mask( + (__v16sf)__A, (__v16sf)__B, __C, (__v16sf)_mm512_setzero_ps(), + (__mmask16)-1, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_range_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, + int __C) { + return (__m512)__builtin_ia32_rangeps512_mask((__v16sf)__A, (__v16sf)__B, __C, + (__v16sf)__W, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_range_ps(__mmask16 __U, __m512 __A, __m512 __B, int __C) { + return (__m512)__builtin_ia32_rangeps512_mask( + (__v16sf)__A, (__v16sf)__B, __C, (__v16sf)_mm512_setzero_ps(), + (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_reduce_sd(__m128d __A, __m128d __B, int __C) { + return (__m128d)__builtin_ia32_reducesd_mask( + (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm_mask_reduce_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + int __C) { + return (__m128d)__builtin_ia32_reducesd_mask((__v2df)__A, (__v2df)__B, __C, + (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_reduce_sd(__mmask8 __U, __m128d __A, __m128d __B, int __C) { + return (__m128d)__builtin_ia32_reducesd_mask( + (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_reduce_ss(__m128 __A, __m128 __B, int __C) { + return (__m128)__builtin_ia32_reducess_mask( + (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_reduce_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, int __C) { + return (__m128)__builtin_ia32_reducess_mask((__v4sf)__A, (__v4sf)__B, __C, + (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_reduce_ss(__mmask8 __U, __m128 __A, __m128 __B, int __C) { + return (__m128)__builtin_ia32_reducess_mask( + (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_range_sd(__m128d __A, __m128d __B, int __C) { + return (__m128d)__builtin_ia32_rangesd128_mask_round( + (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_range_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + int __C) { + return (__m128d)__builtin_ia32_rangesd128_mask_round( + (__v2df)__A, (__v2df)__B, __C, (__v2df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_range_sd(__mmask8 __U, __m128d __A, __m128d __B, int __C) { + return (__m128d)__builtin_ia32_rangesd128_mask_round( + (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_range_ss(__m128 __A, __m128 __B, int __C) { + return (__m128)__builtin_ia32_rangess128_mask_round( + (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_range_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, int __C) { + return (__m128)__builtin_ia32_rangess128_mask_round( + (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_range_ss(__mmask8 __U, __m128 __A, __m128 __B, int __C) { + return (__m128)__builtin_ia32_rangess128_mask_round( + (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_range_round_sd(__m128d __A, __m128d __B, int __C, const int __R) { + return (__m128d)__builtin_ia32_rangesd128_mask_round( + (__v2df)__A, (__v2df)__B, 
__C, (__v2df)_mm_setzero_pd(), (__mmask8)-1, + __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_range_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + int __C, const int __R) { + return (__m128d)__builtin_ia32_rangesd128_mask_round( + (__v2df)__A, (__v2df)__B, __C, (__v2df)__W, (__mmask8)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_range_round_sd(__mmask8 __U, __m128d __A, __m128d __B, int __C, + const int __R) { + return (__m128d)__builtin_ia32_rangesd128_mask_round( + (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)__U, + __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_range_round_ss(__m128 __A, __m128 __B, int __C, const int __R) { + return (__m128)__builtin_ia32_rangess128_mask_round( + (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)-1, + __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_range_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + int __C, const int __R) { + return (__m128)__builtin_ia32_rangess128_mask_round( + (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)__W, (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_range_round_ss(__mmask8 __U, __m128 __A, __m128 __B, int __C, + const int __R) { + return (__m128)__builtin_ia32_rangess128_mask_round( + (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, + __R); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fpclass_ss_mask(__m128 __A, const int __imm) { + return (__mmask8)__builtin_ia32_fpclassss((__v4sf)__A, __imm); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fpclass_sd_mask(__m128d __A, const int __imm) { + return (__mmask8)__builtin_ia32_fpclasssd((__v2df)__A, __imm); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtt_roundpd_epi64(__m512d __A, const int __R) { + return (__m512i)__builtin_ia32_cvttpd2qq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtt_roundpd_epi64(__m512i __W, __mmask8 __U, __m512d __A, + const int __R) { + return (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)__A, (__v8di)__W, + (__mmask8)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtt_roundpd_epi64(__mmask8 __U, __m512d __A, const int __R) { + return (__m512i)__builtin_ia32_cvttpd2qq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtt_roundpd_epu64(__m512d __A, const int __R) { + return (__m512i)__builtin_ia32_cvttpd2uqq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtt_roundpd_epu64(__m512i __W, __mmask8 __U, __m512d __A, + const int __R) { + return (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)__A, (__v8di)__W, + (__mmask8)__U, __R); +} + +extern 
__inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtt_roundpd_epu64(__mmask8 __U, __m512d __A, const int __R) { + return (__m512i)__builtin_ia32_cvttpd2uqq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtt_roundps_epi64(__m256 __A, const int __R) { + return (__m512i)__builtin_ia32_cvttps2qq512_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtt_roundps_epi64(__m512i __W, __mmask8 __U, __m256 __A, + const int __R) { + return (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)__A, (__v8di)__W, + (__mmask8)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtt_roundps_epi64(__mmask8 __U, __m256 __A, const int __R) { + return (__m512i)__builtin_ia32_cvttps2qq512_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtt_roundps_epu64(__m256 __A, const int __R) { + return (__m512i)__builtin_ia32_cvttps2uqq512_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtt_roundps_epu64(__m512i __W, __mmask8 __U, __m256 __A, + const int __R) { + return (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)__A, (__v8di)__W, + (__mmask8)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtt_roundps_epu64(__mmask8 __U, __m256 __A, const int __R) { + return (__m512i)__builtin_ia32_cvttps2uqq512_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvt_roundpd_epi64(__m512d __A, const int __R) { + return (__m512i)__builtin_ia32_cvtpd2qq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvt_roundpd_epi64(__m512i __W, __mmask8 __U, __m512d __A, + const int __R) { + return (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)__A, (__v8di)__W, + (__mmask8)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvt_roundpd_epi64(__mmask8 __U, __m512d __A, const int __R) { + return (__m512i)__builtin_ia32_cvtpd2qq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvt_roundpd_epu64(__m512d __A, const int __R) { + return (__m512i)__builtin_ia32_cvtpd2uqq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvt_roundpd_epu64(__m512i __W, __mmask8 __U, __m512d __A, + const int __R) { + return (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)__A, (__v8di)__W, + (__mmask8)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm512_maskz_cvt_roundpd_epu64(__mmask8 __U, __m512d __A, const int __R) { + return (__m512i)__builtin_ia32_cvtpd2uqq512_mask( + (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvt_roundps_epi64(__m256 __A, const int __R) { + return (__m512i)__builtin_ia32_cvtps2qq512_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvt_roundps_epi64(__m512i __W, __mmask8 __U, __m256 __A, + const int __R) { + return (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)__A, (__v8di)__W, + (__mmask8)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvt_roundps_epi64(__mmask8 __U, __m256 __A, const int __R) { + return (__m512i)__builtin_ia32_cvtps2qq512_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvt_roundps_epu64(__m256 __A, const int __R) { + return (__m512i)__builtin_ia32_cvtps2uqq512_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvt_roundps_epu64(__m512i __W, __mmask8 __U, __m256 __A, + const int __R) { + return (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)__A, (__v8di)__W, + (__mmask8)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvt_roundps_epu64(__mmask8 __U, __m256 __A, const int __R) { + return (__m512i)__builtin_ia32_cvtps2uqq512_mask( + (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvt_roundepi64_ps(__m512i __A, const int __R) { + return (__m256)__builtin_ia32_cvtqq2ps512_mask( + (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, __R); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvt_roundepi64_ps(__m256 __W, __mmask8 __U, __m512i __A, + const int __R) { + return (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)__A, (__v8sf)__W, + (__mmask8)__U, __R); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvt_roundepi64_ps(__mmask8 __U, __m512i __A, const int __R) { + return (__m256)__builtin_ia32_cvtqq2ps512_mask( + (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, __R); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvt_roundepu64_ps(__m512i __A, const int __R) { + return (__m256)__builtin_ia32_cvtuqq2ps512_mask( + (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, __R); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvt_roundepu64_ps(__m256 __W, __mmask8 __U, __m512i __A, + const int __R) { + return (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)__A, (__v8sf)__W, + (__mmask8)__U, __R); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvt_roundepu64_ps(__mmask8 __U, __m512i __A, const int __R) { + return (__m256)__builtin_ia32_cvtuqq2ps512_mask( + (__v8di)__A, 
(__v8sf)_mm256_setzero_ps(), (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvt_roundepi64_pd(__m512i __A, const int __R) { + return (__m512d)__builtin_ia32_cvtqq2pd512_mask( + (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvt_roundepi64_pd(__m512d __W, __mmask8 __U, __m512i __A, + const int __R) { + return (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)__A, (__v8df)__W, + (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvt_roundepi64_pd(__mmask8 __U, __m512i __A, const int __R) { + return (__m512d)__builtin_ia32_cvtqq2pd512_mask( + (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvt_roundepu64_pd(__m512i __A, const int __R) { + return (__m512d)__builtin_ia32_cvtuqq2pd512_mask( + (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvt_roundepu64_pd(__m512d __W, __mmask8 __U, __m512i __A, + const int __R) { + return (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)__A, (__v8df)__W, + (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvt_roundepu64_pd(__mmask8 __U, __m512i __A, const int __R) { + return (__m512d)__builtin_ia32_cvtuqq2pd512_mask( + (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_pd(__m512d __A, int __B) { + return (__m512d)__builtin_ia32_reducepd512_mask( + (__v8df)__A, __B, (__v8df)_mm512_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_pd(__m512d __W, __mmask8 __U, __m512d __A, int __B) { + return (__m512d)__builtin_ia32_reducepd512_mask((__v8df)__A, __B, (__v8df)__W, + (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_reduce_pd(__mmask8 __U, __m512d __A, int __B) { + return (__m512d)__builtin_ia32_reducepd512_mask( + (__v8df)__A, __B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_ps(__m512 __A, int __B) { + return (__m512)__builtin_ia32_reduceps512_mask( + (__v16sf)__A, __B, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_ps(__m512 __W, __mmask16 __U, __m512 __A, int __B) { + return (__m512)__builtin_ia32_reduceps512_mask((__v16sf)__A, __B, + (__v16sf)__W, (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_reduce_ps(__mmask16 __U, __m512 __A, int __B) { + return (__m512)__builtin_ia32_reduceps512_mask( + (__v16sf)__A, __B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_extractf32x8_ps(__m512 __A, const int __imm) { + return 
(__m256)__builtin_ia32_extractf32x8_mask( + (__v16sf)__A, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A, + const int __imm) { + return (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)__A, __imm, + (__v8sf)__W, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A, const int __imm) { + return (__m256)__builtin_ia32_extractf32x8_mask( + (__v16sf)__A, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_extractf64x2_pd(__m512d __A, const int __imm) { + return (__m128d)__builtin_ia32_extractf64x2_512_mask( + (__v8df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A, + const int __imm) { + return (__m128d)__builtin_ia32_extractf64x2_512_mask( + (__v8df)__A, __imm, (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A, const int __imm) { + return (__m128d)__builtin_ia32_extractf64x2_512_mask( + (__v8df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_extracti32x8_epi32(__m512i __A, const int __imm) { + return (__m256i)__builtin_ia32_extracti32x8_mask( + (__v16si)__A, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A, + const int __imm) { + return (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)__A, __imm, + (__v8si)__W, (__mmask8)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A, const int __imm) { + return (__m256i)__builtin_ia32_extracti32x8_mask( + (__v16si)__A, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_extracti64x2_epi64(__m512i __A, const int __imm) { + return (__m128i)__builtin_ia32_extracti64x2_512_mask( + (__v8di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A, + const int __imm) { + return (__m128i)__builtin_ia32_extracti64x2_512_mask( + (__v8di)__A, __imm, (__v2di)__W, (__mmask8)__U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A, const int __imm) { + return (__m128i)__builtin_ia32_extracti64x2_512_mask( + (__v8di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_range_round_pd(__m512d __A, __m512d __B, int __C, const int __R) { + return 
(__m512d)__builtin_ia32_rangepd512_mask((__v8df)__A, (__v8df)__B, __C, + (__v8df)_mm512_setzero_pd(), + (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_range_round_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, int __C, const int __R) { + return (__m512d)__builtin_ia32_rangepd512_mask( + (__v8df)__A, (__v8df)__B, __C, (__v8df)__W, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_range_round_pd(__mmask8 __U, __m512d __A, __m512d __B, int __C, + const int __R) { + return (__m512d)__builtin_ia32_rangepd512_mask((__v8df)__A, (__v8df)__B, __C, + (__v8df)_mm512_setzero_pd(), + (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_range_round_ps(__m512 __A, __m512 __B, int __C, const int __R) { + return (__m512)__builtin_ia32_rangeps512_mask((__v16sf)__A, (__v16sf)__B, __C, + (__v16sf)_mm512_setzero_ps(), + (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_range_round_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, int __C, const int __R) { + return (__m512)__builtin_ia32_rangeps512_mask( + (__v16sf)__A, (__v16sf)__B, __C, (__v16sf)__W, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_range_round_ps(__mmask16 __U, __m512 __A, __m512 __B, int __C, + const int __R) { + return (__m512)__builtin_ia32_rangeps512_mask((__v16sf)__A, (__v16sf)__B, __C, + (__v16sf)_mm512_setzero_ps(), + (__mmask16)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_inserti32x8(__m512i __A, __m256i __B, const int __imm) { + return (__m512i)__builtin_ia32_inserti32x8_mask( + (__v16si)__A, (__v8si)__B, __imm, (__v16si)_mm512_setzero_si512(), + (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_inserti32x8(__m512i __W, __mmask16 __U, __m512i __A, + __m256i __B, const int __imm) { + return (__m512i)__builtin_ia32_inserti32x8_mask( + (__v16si)__A, (__v8si)__B, __imm, (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_inserti32x8(__mmask16 __U, __m512i __A, __m256i __B, + const int __imm) { + return (__m512i)__builtin_ia32_inserti32x8_mask( + (__v16si)__A, (__v8si)__B, __imm, (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_insertf32x8(__m512 __A, __m256 __B, const int __imm) { + return (__m512)__builtin_ia32_insertf32x8_mask( + (__v16sf)__A, (__v8sf)__B, __imm, (__v16sf)_mm512_setzero_ps(), + (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_insertf32x8(__m512 __W, __mmask16 __U, __m512 __A, __m256 __B, + const int __imm) { + return (__m512)__builtin_ia32_insertf32x8_mask( + (__v16sf)__A, (__v8sf)__B, __imm, (__v16sf)__W, (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_insertf32x8(__mmask16 __U, __m512 __A, __m256 __B, + const int __imm) { + return (__m512)__builtin_ia32_insertf32x8_mask( + (__v16sf)__A, 
(__v8sf)__B, __imm, (__v16sf)_mm512_setzero_ps(), + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_inserti64x2(__m512i __A, __m128i __B, const int __imm) { + return (__m512i)__builtin_ia32_inserti64x2_512_mask( + (__v8di)__A, (__v2di)__B, __imm, (__v8di)_mm512_setzero_si512(), + (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_inserti64x2(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B, + const int __imm) { + return (__m512i)__builtin_ia32_inserti64x2_512_mask( + (__v8di)__A, (__v2di)__B, __imm, (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_inserti64x2(__mmask8 __U, __m512i __A, __m128i __B, + const int __imm) { + return (__m512i)__builtin_ia32_inserti64x2_512_mask( + (__v8di)__A, (__v2di)__B, __imm, (__v8di)_mm512_setzero_si512(), + (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_insertf64x2(__m512d __A, __m128d __B, const int __imm) { + return (__m512d)__builtin_ia32_insertf64x2_512_mask( + (__v8df)__A, (__v2df)__B, __imm, (__v8df)_mm512_setzero_pd(), + (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_insertf64x2(__m512d __W, __mmask8 __U, __m512d __A, __m128d __B, + const int __imm) { + return (__m512d)__builtin_ia32_insertf64x2_512_mask( + (__v8df)__A, (__v2df)__B, __imm, (__v8df)__W, (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_insertf64x2(__mmask8 __U, __m512d __A, __m128d __B, + const int __imm) { + return (__m512d)__builtin_ia32_insertf64x2_512_mask( + (__v8df)__A, (__v2df)__B, __imm, (__v8df)_mm512_setzero_pd(), + (__mmask8)__U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fpclass_pd_mask(__mmask8 __U, __m512d __A, const int __imm) { + return (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)__A, __imm, __U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fpclass_pd_mask(__m512d __A, const int __imm) { + return (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)__A, __imm, + (__mmask8)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fpclass_ps_mask(__mmask16 __U, __m512 __A, const int __imm) { + return (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)__A, __imm, __U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fpclass_ps_mask(__m512 __A, const int __imm) { + return (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)__A, __imm, + (__mmask16)-1); +} + +#else +#define _kshiftli_mask8(X, Y) \ + ((__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(X), (__mmask8)(Y))) + +#define _kshiftri_mask8(X, Y) \ + ((__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(X), (__mmask8)(Y))) + +#define _mm_range_sd(A, B, C) \ + ((__m128d)__builtin_ia32_rangesd128_mask_round( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_range_sd(W, U, A, B, C) \ + ((__m128d)__builtin_ia32_rangesd128_mask_round( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), 
\ + (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_range_sd(U, A, B, C) \ + ((__m128d)__builtin_ia32_rangesd128_mask_round( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_range_ss(A, B, C) \ + ((__m128)__builtin_ia32_rangess128_mask_round( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_range_ss(W, U, A, B, C) \ + ((__m128)__builtin_ia32_rangess128_mask_round( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_range_ss(U, A, B, C) \ + ((__m128)__builtin_ia32_rangess128_mask_round( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_range_round_sd(A, B, C, R) \ + ((__m128d)__builtin_ia32_rangesd128_mask_round( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), (__mmask8)-1, (R))) + +#define _mm_mask_range_round_sd(W, U, A, B, C, R) \ + ((__m128d)__builtin_ia32_rangesd128_mask_round( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)(__m128d)(W), (__mmask8)(U), (R))) + +#define _mm_maskz_range_round_sd(U, A, B, C, R) \ + ((__m128d)__builtin_ia32_rangesd128_mask_round( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), (__mmask8)(U), (R))) + +#define _mm_range_round_ss(A, B, C, R) \ + ((__m128)__builtin_ia32_rangess128_mask_round( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (R))) + +#define _mm_mask_range_round_ss(W, U, A, B, C, R) \ + ((__m128)__builtin_ia32_rangess128_mask_round( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \ + (__mmask8)(U), (R))) + +#define _mm_maskz_range_round_ss(U, A, B, C, R) \ + ((__m128)__builtin_ia32_rangess128_mask_round( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (R))) + +#define _mm512_cvtt_roundpd_epi64(A, B) \ + ((__m512i)__builtin_ia32_cvttpd2qq512_mask( \ + (A), (__v8di)_mm512_setzero_si512(), -1, (B))) + +#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvttpd2qq512_mask((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvtt_roundpd_epi64(U, A, B) \ + ((__m512i)__builtin_ia32_cvttpd2qq512_mask( \ + (A), (__v8di)_mm512_setzero_si512(), (U), (B))) + +#define _mm512_cvtt_roundpd_epu64(A, B) \ + ((__m512i)__builtin_ia32_cvttpd2uqq512_mask( \ + (A), (__v8di)_mm512_setzero_si512(), -1, (B))) + +#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvtt_roundpd_epu64(U, A, B) \ + ((__m512i)__builtin_ia32_cvttpd2uqq512_mask( \ + (A), (__v8di)_mm512_setzero_si512(), (U), (B))) + +#define _mm512_cvtt_roundps_epi64(A, B) \ + ((__m512i)__builtin_ia32_cvttps2qq512_mask( \ + (A), (__v8di)_mm512_setzero_si512(), -1, (B))) + +#define _mm512_mask_cvtt_roundps_epi64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2qq512_mask((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvtt_roundps_epi64(U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2qq512_mask( \ + (A), (__v8di)_mm512_setzero_si512(), (U), (B))) + +#define _mm512_cvtt_roundps_epu64(A, B) \ + ((__m512i)__builtin_ia32_cvttps2uqq512_mask( \ + 
(A), (__v8di)_mm512_setzero_si512(), -1, (B))) + +#define _mm512_mask_cvtt_roundps_epu64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2uqq512_mask((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvtt_roundps_epu64(U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2uqq512_mask( \ + (A), (__v8di)_mm512_setzero_si512(), (U), (B))) + +#define _mm512_cvt_roundpd_epi64(A, B) \ + ((__m512i)__builtin_ia32_cvtpd2qq512_mask( \ + (A), (__v8di)_mm512_setzero_si512(), -1, (B))) + +#define _mm512_mask_cvt_roundpd_epi64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvtpd2qq512_mask((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvt_roundpd_epi64(U, A, B) \ + ((__m512i)__builtin_ia32_cvtpd2qq512_mask( \ + (A), (__v8di)_mm512_setzero_si512(), (U), (B))) + +#define _mm512_cvt_roundpd_epu64(A, B) \ + ((__m512i)__builtin_ia32_cvtpd2uqq512_mask( \ + (A), (__v8di)_mm512_setzero_si512(), -1, (B))) + +#define _mm512_mask_cvt_roundpd_epu64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvt_roundpd_epu64(U, A, B) \ + ((__m512i)__builtin_ia32_cvtpd2uqq512_mask( \ + (A), (__v8di)_mm512_setzero_si512(), (U), (B))) + +#define _mm512_cvt_roundps_epi64(A, B) \ + ((__m512i)__builtin_ia32_cvtps2qq512_mask( \ + (A), (__v8di)_mm512_setzero_si512(), -1, (B))) + +#define _mm512_mask_cvt_roundps_epi64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2qq512_mask((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvt_roundps_epi64(U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2qq512_mask( \ + (A), (__v8di)_mm512_setzero_si512(), (U), (B))) + +#define _mm512_cvt_roundps_epu64(A, B) \ + ((__m512i)__builtin_ia32_cvtps2uqq512_mask( \ + (A), (__v8di)_mm512_setzero_si512(), -1, (B))) + +#define _mm512_mask_cvt_roundps_epu64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2uqq512_mask((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvt_roundps_epu64(U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2uqq512_mask( \ + (A), (__v8di)_mm512_setzero_si512(), (U), (B))) + +#define _mm512_cvt_roundepi64_ps(A, B) \ + ((__m256)__builtin_ia32_cvtqq2ps512_mask( \ + (__v8di)(A), (__v8sf)_mm256_setzero_ps(), -1, (B))) + +#define _mm512_mask_cvt_roundepi64_ps(W, U, A, B) \ + ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(A), (W), (U), (B))) + +#define _mm512_maskz_cvt_roundepi64_ps(U, A, B) \ + ((__m256)__builtin_ia32_cvtqq2ps512_mask( \ + (__v8di)(A), (__v8sf)_mm256_setzero_ps(), (U), (B))) + +#define _mm512_cvt_roundepu64_ps(A, B) \ + ((__m256)__builtin_ia32_cvtuqq2ps512_mask( \ + (__v8di)(A), (__v8sf)_mm256_setzero_ps(), -1, (B))) + +#define _mm512_mask_cvt_roundepu64_ps(W, U, A, B) \ + ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(A), (W), (U), (B))) + +#define _mm512_maskz_cvt_roundepu64_ps(U, A, B) \ + ((__m256)__builtin_ia32_cvtuqq2ps512_mask( \ + (__v8di)(A), (__v8sf)_mm256_setzero_ps(), (U), (B))) + +#define _mm512_cvt_roundepi64_pd(A, B) \ + ((__m512d)__builtin_ia32_cvtqq2pd512_mask( \ + (__v8di)(A), (__v8df)_mm512_setzero_pd(), -1, (B))) + +#define _mm512_mask_cvt_roundepi64_pd(W, U, A, B) \ + ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(A), (W), (U), (B))) + +#define _mm512_maskz_cvt_roundepi64_pd(U, A, B) \ + ((__m512d)__builtin_ia32_cvtqq2pd512_mask( \ + (__v8di)(A), (__v8df)_mm512_setzero_pd(), (U), (B))) + +#define _mm512_cvt_roundepu64_pd(A, B) \ + ((__m512d)__builtin_ia32_cvtuqq2pd512_mask( \ + (__v8di)(A), (__v8df)_mm512_setzero_pd(), -1, (B))) + +#define _mm512_mask_cvt_roundepu64_pd(W, U, A, B) \ + ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(A), (W), 
(U), (B))) + +#define _mm512_maskz_cvt_roundepu64_pd(U, A, B) \ + ((__m512d)__builtin_ia32_cvtuqq2pd512_mask( \ + (__v8di)(A), (__v8df)_mm512_setzero_pd(), (U), (B))) + +#define _mm512_reduce_pd(A, B) \ + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1)) + +#define _mm512_mask_reduce_pd(W, U, A, B) \ + ((__m512d)__builtin_ia32_reducepd512_mask( \ + (__v8df)(__m512d)(A), (int)(B), (__v8df)(__m512d)(W), (__mmask8)(U))) + +#define _mm512_maskz_reduce_pd(U, A, B) \ + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm512_reduce_ps(A, B) \ + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1)) + +#define _mm512_mask_reduce_ps(W, U, A, B) \ + ((__m512)__builtin_ia32_reduceps512_mask( \ + (__v16sf)(__m512)(A), (int)(B), (__v16sf)(__m512)(W), (__mmask16)(U))) + +#define _mm512_maskz_reduce_ps(U, A, B) \ + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U))) + +#define _mm512_extractf32x8_ps(X, C) \ + ((__m256)__builtin_ia32_extractf32x8_mask( \ + (__v16sf)(__m512)(X), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps(), \ + (__mmask8)-1)) + +#define _mm512_mask_extractf32x8_ps(W, U, X, C) \ + ((__m256)__builtin_ia32_extractf32x8_mask( \ + (__v16sf)(__m512)(X), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U))) + +#define _mm512_maskz_extractf32x8_ps(U, X, C) \ + ((__m256)__builtin_ia32_extractf32x8_mask( \ + (__v16sf)(__m512)(X), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm512_extractf64x2_pd(X, C) \ + ((__m128d)__builtin_ia32_extractf64x2_512_mask( \ + (__v8df)(__m512d)(X), (int)(C), (__v2df)(__m128d)_mm_setzero_pd(), \ + (__mmask8)-1)) + +#define _mm512_mask_extractf64x2_pd(W, U, X, C) \ + ((__m128d)__builtin_ia32_extractf64x2_512_mask( \ + (__v8df)(__m512d)(X), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U))) + +#define _mm512_maskz_extractf64x2_pd(U, X, C) \ + ((__m128d)__builtin_ia32_extractf64x2_512_mask( \ + (__v8df)(__m512d)(X), (int)(C), (__v2df)(__m128d)_mm_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm512_extracti32x8_epi32(X, C) \ + ((__m256i)__builtin_ia32_extracti32x8_mask( \ + (__v16si)(__m512i)(X), (int)(C), \ + (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)-1)) + +#define _mm512_mask_extracti32x8_epi32(W, U, X, C) \ + ((__m256i)__builtin_ia32_extracti32x8_mask( \ + (__v16si)(__m512i)(X), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U))) + +#define _mm512_maskz_extracti32x8_epi32(U, X, C) \ + ((__m256i)__builtin_ia32_extracti32x8_mask( \ + (__v16si)(__m512i)(X), (int)(C), \ + (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)(U))) + +#define _mm512_extracti64x2_epi64(X, C) \ + ((__m128i)__builtin_ia32_extracti64x2_512_mask( \ + (__v8di)(__m512i)(X), (int)(C), (__v2di)(__m128i)_mm_setzero_si128(), \ + (__mmask8)-1)) + +#define _mm512_mask_extracti64x2_epi64(W, U, X, C) \ + ((__m128i)__builtin_ia32_extracti64x2_512_mask( \ + (__v8di)(__m512i)(X), (int)(C), (__v2di)(__m128i)(W), (__mmask8)(U))) + +#define _mm512_maskz_extracti64x2_epi64(U, X, C) \ + ((__m128i)__builtin_ia32_extracti64x2_512_mask( \ + (__v8di)(__m512i)(X), (int)(C), (__v2di)(__m128i)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm512_range_pd(A, B, C) \ + ((__m512d)__builtin_ia32_rangepd512_mask( \ + (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \ + 
(__v8df)_mm512_setzero_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_range_pd(W, U, A, B, C) \ + ((__m512d)__builtin_ia32_rangepd512_mask( \ + (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_range_pd(U, A, B, C) \ + ((__m512d)__builtin_ia32_rangepd512_mask( \ + (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_range_ps(A, B, C) \ + ((__m512)__builtin_ia32_rangeps512_mask( \ + (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_range_ps(W, U, A, B, C) \ + ((__m512)__builtin_ia32_rangeps512_mask( \ + (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_range_ps(U, A, B, C) \ + ((__m512)__builtin_ia32_rangeps512_mask( \ + (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_range_round_pd(A, B, C, R) \ + ((__m512d)__builtin_ia32_rangepd512_mask( \ + (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), (__mmask8)-1, (R))) + +#define _mm512_mask_range_round_pd(W, U, A, B, C, R) \ + ((__m512d)__builtin_ia32_rangepd512_mask( \ + (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)(__m512d)(W), (__mmask8)(U), (R))) + +#define _mm512_maskz_range_round_pd(U, A, B, C, R) \ + ((__m512d)__builtin_ia32_rangepd512_mask( \ + (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (R))) + +#define _mm512_range_round_ps(A, B, C, R) \ + ((__m512)__builtin_ia32_rangeps512_mask( \ + (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, (R))) + +#define _mm512_mask_range_round_ps(W, U, A, B, C, R) \ + ((__m512)__builtin_ia32_rangeps512_mask( \ + (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)(__m512)(W), (__mmask16)(U), (R))) + +#define _mm512_maskz_range_round_ps(U, A, B, C, R) \ + ((__m512)__builtin_ia32_rangeps512_mask( \ + (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (R))) + +#define _mm512_insertf64x2(X, Y, C) \ + ((__m512d)__builtin_ia32_insertf64x2_512_mask( \ + (__v8df)(__m512d)(X), (__v2df)(__m128d)(Y), (int)(C), \ + (__v8df)(__m512d)(X), (__mmask8)-1)) + +#define _mm512_mask_insertf64x2(W, U, X, Y, C) \ + ((__m512d)__builtin_ia32_insertf64x2_512_mask( \ + (__v8df)(__m512d)(X), (__v2df)(__m128d)(Y), (int)(C), \ + (__v8df)(__m512d)(W), (__mmask8)(U))) + +#define _mm512_maskz_insertf64x2(U, X, Y, C) \ + ((__m512d)__builtin_ia32_insertf64x2_512_mask( \ + (__v8df)(__m512d)(X), (__v2df)(__m128d)(Y), (int)(C), \ + (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U))) + +#define _mm512_inserti64x2(X, Y, C) \ + ((__m512i)__builtin_ia32_inserti64x2_512_mask( \ + (__v8di)(__m512i)(X), (__v2di)(__m128i)(Y), (int)(C), \ + (__v8di)(__m512i)(X), (__mmask8)-1)) + +#define _mm512_mask_inserti64x2(W, U, X, Y, C) \ + ((__m512i)__builtin_ia32_inserti64x2_512_mask( \ + (__v8di)(__m512i)(X), (__v2di)(__m128i)(Y), (int)(C), \ + (__v8di)(__m512i)(W), (__mmask8)(U))) + +#define _mm512_maskz_inserti64x2(U, X, Y, C) \ + ((__m512i)__builtin_ia32_inserti64x2_512_mask( \ + (__v8di)(__m512i)(X), 
(__v2di)(__m128i)(Y), (int)(C), \ + (__v8di)(__m512i)_mm512_setzero_si512(), (__mmask8)(U))) + +#define _mm512_insertf32x8(X, Y, C) \ + ((__m512)__builtin_ia32_insertf32x8_mask( \ + (__v16sf)(__m512)(X), (__v8sf)(__m256)(Y), (int)(C), \ + (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)-1)) + +#define _mm512_mask_insertf32x8(W, U, X, Y, C) \ + ((__m512)__builtin_ia32_insertf32x8_mask( \ + (__v16sf)(__m512)(X), (__v8sf)(__m256)(Y), (int)(C), \ + (__v16sf)(__m512)(W), (__mmask16)(U))) + +#define _mm512_maskz_insertf32x8(U, X, Y, C) \ + ((__m512)__builtin_ia32_insertf32x8_mask( \ + (__v16sf)(__m512)(X), (__v8sf)(__m256)(Y), (int)(C), \ + (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U))) + +#define _mm512_inserti32x8(X, Y, C) \ + ((__m512i)__builtin_ia32_inserti32x8_mask( \ + (__v16si)(__m512i)(X), (__v8si)(__m256i)(Y), (int)(C), \ + (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)-1)) + +#define _mm512_mask_inserti32x8(W, U, X, Y, C) \ + ((__m512i)__builtin_ia32_inserti32x8_mask( \ + (__v16si)(__m512i)(X), (__v8si)(__m256i)(Y), (int)(C), \ + (__v16si)(__m512i)(W), (__mmask16)(U))) + +#define _mm512_maskz_inserti32x8(U, X, Y, C) \ + ((__m512i)__builtin_ia32_inserti32x8_mask( \ + (__v16si)(__m512i)(X), (__v8si)(__m256i)(Y), (int)(C), \ + (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)(U))) + +#define _mm_fpclass_ss_mask(X, C) \ + ((__mmask8)__builtin_ia32_fpclassss((__v4sf)(__m128)(X), (int)(C))) + +#define _mm_fpclass_sd_mask(X, C) \ + ((__mmask8)__builtin_ia32_fpclasssd((__v2df)(__m128d)(X), (int)(C))) + +#define _mm512_mask_fpclass_pd_mask(u, X, C) \ + ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(X), (int)(C), \ + (__mmask8)(u))) + +#define _mm512_mask_fpclass_ps_mask(u, x, c) \ + ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(x), (int)(c), \ + (__mmask8)(u))) + +#define _mm512_fpclass_pd_mask(X, C) \ + ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(X), (int)(C), \ + (__mmask8)-1)) + +#define _mm512_fpclass_ps_mask(x, c) \ + ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(x), (int)(c), \ + (__mmask8)-1)) + +#define _mm_reduce_sd(A, B, C) \ + ((__m128d)__builtin_ia32_reducesd_mask( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), (__mmask8)-1)) + +#define _mm_mask_reduce_sd(W, U, A, B, C) \ + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)(__m128d)(W), (__mmask8)(U))) + +#define _mm_maskz_reduce_sd(U, A, B, C) \ + ((__m128d)__builtin_ia32_reducesd_mask( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), (__mmask8)(U))) + +#define _mm_reduce_ss(A, B, C) \ + ((__m128)__builtin_ia32_reducess_mask( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)-1)) + +#define _mm_mask_reduce_ss(W, U, A, B, C) \ + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)(__m128)(W), (__mmask8)(U))) + +#define _mm_maskz_reduce_ss(U, A, B, C) \ + ((__m128)__builtin_ia32_reducess_mask( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)(U))) + +#endif + +#ifdef __DISABLE_AVX512DQ__ +#undef __DISABLE_AVX512DQ__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512DQ__ */ + +#endif /* _AVX512DQINTRIN_H_INCLUDED */ diff --git a/third_party/intel/avx512erintrin.internal.h b/third_party/intel/avx512erintrin.internal.h new file mode 100644 index 000000000..e50df746c 
--- /dev/null +++ b/third_party/intel/avx512erintrin.internal.h @@ -0,0 +1,314 @@ +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512ERINTRIN_H_INCLUDED +#define _AVX512ERINTRIN_H_INCLUDED + +#ifndef __AVX512ER__ +#pragma GCC push_options +#pragma GCC target("avx512er") +#define __DISABLE_AVX512ER__ +#endif /* __AVX512ER__ */ + +typedef double __v8df __attribute__((__vector_size__(64))); +typedef float __v16sf __attribute__((__vector_size__(64))); + +typedef float __m512 __attribute__((__vector_size__(64), __may_alias__)); +typedef double __m512d __attribute__((__vector_size__(64), __may_alias__)); + +typedef unsigned char __mmask8; +typedef unsigned short __mmask16; + +#ifdef __OPTIMIZE__ +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_exp2a23_round_pd(__m512d __A, int __R) { + __m512d __W; + return (__m512d)__builtin_ia32_exp2pd_mask((__v8df)__A, (__v8df)__W, + (__mmask8)-1, __R); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_exp2a23_round_pd(__m512d __W, __mmask8 __U, __m512d __A, int __R) { + return (__m512d)__builtin_ia32_exp2pd_mask((__v8df)__A, (__v8df)__W, + (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_exp2a23_round_pd(__mmask8 __U, __m512d __A, int __R) { + return (__m512d)__builtin_ia32_exp2pd_mask( + (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_exp2a23_round_ps(__m512 __A, int __R) { + __m512 __W; + return (__m512)__builtin_ia32_exp2ps_mask((__v16sf)__A, (__v16sf)__W, + (__mmask16)-1, __R); +} + +extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_exp2a23_round_ps(__m512 __W, __mmask16 __U, __m512 __A, int __R) { + return (__m512)__builtin_ia32_exp2ps_mask((__v16sf)__A, (__v16sf)__W, + (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_exp2a23_round_ps(__mmask16 __U, __m512 __A, int __R) { + return (__m512)__builtin_ia32_exp2ps_mask( + (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_rcp28_round_pd(__m512d __A, int __R) { + __m512d __W; + return (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)__A, (__v8df)__W, + (__mmask8)-1, __R); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_rcp28_round_pd(__m512d __W, __mmask8 __U, __m512d __A, int __R) { + return (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)__A, (__v8df)__W, + (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_rcp28_round_pd(__mmask8 __U, __m512d __A, int __R) { + return (__m512d)__builtin_ia32_rcp28pd_mask( + (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_rcp28_round_ps(__m512 __A, int __R) { + __m512 __W; + return (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)__A, (__v16sf)__W, + (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
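+    /* Editorial note, not in the upstream header: the AVX512ER family
+       (exp2a23 = 2^x accurate to about 23 bits, rcp28/rsqrt28 = reciprocal
+       and reciprocal square root with relative error < 2^-28) only ever
+       shipped on Xeon Phi (Knights Landing/Mill) parts. */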
_mm512_mask_rcp28_round_ps(__m512 __W, __mmask16 __U, __m512 __A, int __R) { + return (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)__A, (__v16sf)__W, + (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_rcp28_round_ps(__mmask16 __U, __m512 __A, int __R) { + return (__m512)__builtin_ia32_rcp28ps_mask( + (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rcp28_round_sd(__m128d __A, __m128d __B, int __R) { + return (__m128d)__builtin_ia32_rcp28sd_round((__v2df)__B, (__v2df)__A, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rcp28_round_ss(__m128 __A, __m128 __B, int __R) { + return (__m128)__builtin_ia32_rcp28ss_round((__v4sf)__B, (__v4sf)__A, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_rsqrt28_round_pd(__m512d __A, int __R) { + __m512d __W; + return (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)__A, (__v8df)__W, + (__mmask8)-1, __R); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_rsqrt28_round_pd(__m512d __W, __mmask8 __U, __m512d __A, int __R) { + return (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)__A, (__v8df)__W, + (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_rsqrt28_round_pd(__mmask8 __U, __m512d __A, int __R) { + return (__m512d)__builtin_ia32_rsqrt28pd_mask( + (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_rsqrt28_round_ps(__m512 __A, int __R) { + __m512 __W; + return (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)__A, (__v16sf)__W, + (__mmask16)-1, __R); +} + +extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_rsqrt28_round_ps(__m512 __W, __mmask16 __U, __m512 __A, int __R) { + return (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)__A, (__v16sf)__W, + (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_rsqrt28_round_ps(__mmask16 __U, __m512 __A, int __R) { + return (__m512)__builtin_ia32_rsqrt28ps_mask( + (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rsqrt28_round_sd(__m128d __A, __m128d __B, int __R) { + return (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)__B, (__v2df)__A, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rsqrt28_round_ss(__m128 __A, __m128 __B, int __R) { + return (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)__B, (__v4sf)__A, __R); +} + +#else +#define _mm512_exp2a23_round_pd(A, C) \ + __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) + +#define _mm512_mask_exp2a23_round_pd(W, U, A, C) \ + __builtin_ia32_exp2pd_mask(A, W, U, C) + +#define _mm512_maskz_exp2a23_round_pd(U, A, C) \ + __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_exp2a23_round_ps(A, C) \ + __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) + +#define _mm512_mask_exp2a23_round_ps(W, U, A, C) \ + 
__builtin_ia32_exp2ps_mask(A, W, U, C) + +#define _mm512_maskz_exp2a23_round_ps(U, A, C) \ + __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm512_rcp28_round_pd(A, C) \ + __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) + +#define _mm512_mask_rcp28_round_pd(W, U, A, C) \ + __builtin_ia32_rcp28pd_mask(A, W, U, C) + +#define _mm512_maskz_rcp28_round_pd(U, A, C) \ + __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_rcp28_round_ps(A, C) \ + __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) + +#define _mm512_mask_rcp28_round_ps(W, U, A, C) \ + __builtin_ia32_rcp28ps_mask(A, W, U, C) + +#define _mm512_maskz_rcp28_round_ps(U, A, C) \ + __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm512_rsqrt28_round_pd(A, C) \ + __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) + +#define _mm512_mask_rsqrt28_round_pd(W, U, A, C) \ + __builtin_ia32_rsqrt28pd_mask(A, W, U, C) + +#define _mm512_maskz_rsqrt28_round_pd(U, A, C) \ + __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_rsqrt28_round_ps(A, C) \ + __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) + +#define _mm512_mask_rsqrt28_round_ps(W, U, A, C) \ + __builtin_ia32_rsqrt28ps_mask(A, W, U, C) + +#define _mm512_maskz_rsqrt28_round_ps(U, A, C) \ + __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm_rcp28_round_sd(A, B, R) __builtin_ia32_rcp28sd_round(A, B, R) + +#define _mm_rcp28_round_ss(A, B, R) __builtin_ia32_rcp28ss_round(A, B, R) + +#define _mm_rsqrt28_round_sd(A, B, R) __builtin_ia32_rsqrt28sd_round(A, B, R) + +#define _mm_rsqrt28_round_ss(A, B, R) __builtin_ia32_rsqrt28ss_round(A, B, R) + +#endif + +#define _mm512_exp2a23_pd(A) \ + _mm512_exp2a23_round_pd(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_exp2a23_pd(W, U, A) \ + _mm512_mask_exp2a23_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_exp2a23_pd(U, A) \ + _mm512_maskz_exp2a23_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_exp2a23_ps(A) \ + _mm512_exp2a23_round_ps(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_exp2a23_ps(W, U, A) \ + _mm512_mask_exp2a23_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_exp2a23_ps(U, A) \ + _mm512_maskz_exp2a23_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rcp28_pd(A) _mm512_rcp28_round_pd(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rcp28_pd(W, U, A) \ + _mm512_mask_rcp28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rcp28_pd(U, A) \ + _mm512_maskz_rcp28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rcp28_ps(A) _mm512_rcp28_round_ps(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rcp28_ps(W, U, A) \ + _mm512_mask_rcp28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rcp28_ps(U, A) \ + _mm512_maskz_rcp28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rsqrt28_pd(A) \ + _mm512_rsqrt28_round_pd(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rsqrt28_pd(W, U, A) \ + _mm512_mask_rsqrt28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rsqrt28_pd(U, A) \ + _mm512_maskz_rsqrt28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rsqrt28_ps(A) \ + _mm512_rsqrt28_round_ps(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rsqrt28_ps(W, U, A) \ + _mm512_mask_rsqrt28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rsqrt28_ps(U, A) \ + 
_mm512_maskz_rsqrt28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm_rcp28_sd(A, B) \ + __builtin_ia32_rcp28sd_round(B, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm_rcp28_ss(A, B) \ + __builtin_ia32_rcp28ss_round(B, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm_rsqrt28_sd(A, B) \ + __builtin_ia32_rsqrt28sd_round(B, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm_rsqrt28_ss(A, B) \ + __builtin_ia32_rsqrt28ss_round(B, A, _MM_FROUND_CUR_DIRECTION) + +#ifdef __DISABLE_AVX512ER__ +#undef __DISABLE_AVX512ER__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512ER__ */ + +#endif /* _AVX512ERINTRIN_H_INCLUDED */ diff --git a/third_party/intel/avx512fintrin.internal.h b/third_party/intel/avx512fintrin.internal.h new file mode 100644 index 000000000..f7d7eeeb5 --- /dev/null +++ b/third_party/intel/avx512fintrin.internal.h @@ -0,0 +1,12519 @@ +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512FINTRIN_H_INCLUDED +#define _AVX512FINTRIN_H_INCLUDED + +#ifndef __AVX512F__ +#pragma GCC push_options +#pragma GCC target("avx512f") +#define __DISABLE_AVX512F__ +#endif /* __AVX512F__ */ + +typedef double __v8df __attribute__((__vector_size__(64))); +typedef float __v16sf __attribute__((__vector_size__(64))); +typedef long long __v8di __attribute__((__vector_size__(64))); +typedef unsigned long long __v8du __attribute__((__vector_size__(64))); +typedef int __v16si __attribute__((__vector_size__(64))); +typedef unsigned int __v16su __attribute__((__vector_size__(64))); +typedef short __v32hi __attribute__((__vector_size__(64))); +typedef unsigned short __v32hu __attribute__((__vector_size__(64))); +typedef char __v64qi __attribute__((__vector_size__(64))); +typedef unsigned char __v64qu __attribute__((__vector_size__(64))); + +typedef float __m512 __attribute__((__vector_size__(64), __may_alias__)); +typedef long long __m512i __attribute__((__vector_size__(64), __may_alias__)); +typedef double __m512d __attribute__((__vector_size__(64), __may_alias__)); + +typedef float __m512_u + __attribute__((__vector_size__(64), __may_alias__, __aligned__(1))); +typedef long long __m512i_u + __attribute__((__vector_size__(64), __may_alias__, __aligned__(1))); +typedef double __m512d_u + __attribute__((__vector_size__(64), __may_alias__, __aligned__(1))); + +typedef unsigned char __mmask8; +typedef unsigned short __mmask16; + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_int2mask(int __M) { + return (__mmask16)__M; +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask2int(__mmask16 __M) { + return (int)__M; +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_set_epi64(long long __A, long long __B, long long __C, long long __D, + long long __E, long long __F, long long __G, + long long __H) { + return __extension__(__m512i)(__v8di){__H, __G, __F, __E, __D, __C, __B, __A}; +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_set_epi32(int __A, int __B, int __C, int __D, int __E, int __F, + int __G, int __H, int __I, int __J, int __K, int __L, + int __M, int __N, int __O, int __P) { + return __extension__(__m512i)(__v16si){__P, __O, __N, __M, __L, __K, + __J, __I, __H, __G, __F, __E, + __D, __C, __B, __A}; +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_set_epi16(short __q31, short __q30, short 
__q29, short __q28, + short __q27, short __q26, short __q25, short __q24, + short __q23, short __q22, short __q21, short __q20, + short __q19, short __q18, short __q17, short __q16, + short __q15, short __q14, short __q13, short __q12, + short __q11, short __q10, short __q09, short __q08, + short __q07, short __q06, short __q05, short __q04, + short __q03, short __q02, short __q01, short __q00) { + return __extension__(__m512i)(__v32hi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, + __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23, + __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31}; +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_set_epi8(char __q63, char __q62, char __q61, char __q60, char __q59, + char __q58, char __q57, char __q56, char __q55, char __q54, + char __q53, char __q52, char __q51, char __q50, char __q49, + char __q48, char __q47, char __q46, char __q45, char __q44, + char __q43, char __q42, char __q41, char __q40, char __q39, + char __q38, char __q37, char __q36, char __q35, char __q34, + char __q33, char __q32, char __q31, char __q30, char __q29, + char __q28, char __q27, char __q26, char __q25, char __q24, + char __q23, char __q22, char __q21, char __q20, char __q19, + char __q18, char __q17, char __q16, char __q15, char __q14, + char __q13, char __q12, char __q11, char __q10, char __q09, + char __q08, char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) { + return __extension__(__m512i)(__v64qi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, __q08, __q09, + __q10, __q11, __q12, __q13, __q14, __q15, __q16, __q17, __q18, __q19, + __q20, __q21, __q22, __q23, __q24, __q25, __q26, __q27, __q28, __q29, + __q30, __q31, __q32, __q33, __q34, __q35, __q36, __q37, __q38, __q39, + __q40, __q41, __q42, __q43, __q44, __q45, __q46, __q47, __q48, __q49, + __q50, __q51, __q52, __q53, __q54, __q55, __q56, __q57, __q58, __q59, + __q60, __q61, __q62, __q63}; +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_set_pd(double __A, double __B, double __C, double __D, double __E, + double __F, double __G, double __H) { + return __extension__(__m512d){__H, __G, __F, __E, __D, __C, __B, __A}; +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_set_ps(float __A, float __B, float __C, float __D, float __E, + float __F, float __G, float __H, float __I, float __J, + float __K, float __L, float __M, float __N, float __O, + float __P) { + return __extension__(__m512){__P, __O, __N, __M, __L, __K, __J, __I, + __H, __G, __F, __E, __D, __C, __B, __A}; +} + +#define _mm512_setr_epi64(e0, e1, e2, e3, e4, e5, e6, e7) \ + _mm512_set_epi64(e7, e6, e5, e4, e3, e2, e1, e0) + +#define _mm512_setr_epi32(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, \ + e12, e13, e14, e15) \ + _mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, \ + e2, e1, e0) + +#define _mm512_setr_pd(e0, e1, e2, e3, e4, e5, e6, e7) \ + _mm512_set_pd(e7, e6, e5, e4, e3, e2, e1, e0) + +#define _mm512_setr_ps(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, \ + e13, e14, e15) \ + _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, \ + e1, e0) + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_undefined_ps(void) { + __m512 __Y = __Y; + 
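+  /* Editorial note: the self-initialization above is a GCC idiom that
+     yields an indeterminate value while suppressing -Wuninitialized
+     (see -Winit-self); callers must not rely on its contents. */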
return __Y; +} + +#define _mm512_undefined _mm512_undefined_ps + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_undefined_pd(void) { + __m512d __Y = __Y; + return __Y; +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_undefined_epi32(void) { + __m512i __Y = __Y; + return __Y; +} + +#define _mm512_undefined_si512 _mm512_undefined_epi32 + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_set1_epi8(char __A) { + return __extension__(__m512i)(__v64qi){ + __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A}; +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_set1_epi16(short __A) { + return __extension__(__m512i)(__v32hi){ + __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, __A, __A}; +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_set1_pd(double __A) { + return (__m512d)__builtin_ia32_broadcastsd512( + __extension__(__v2df){ + __A, + }, + (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_set1_ps(float __A) { + return (__m512)__builtin_ia32_broadcastss512( + __extension__(__v4sf){ + __A, + }, + (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_set4_epi32(int __A, int __B, int __C, int __D) { + return __extension__(__m512i)(__v16si){__D, __C, __B, __A, __D, __C, + __B, __A, __D, __C, __B, __A, + __D, __C, __B, __A}; +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_set4_epi64(long long __A, long long __B, long long __C, long long __D) { + return __extension__(__m512i)(__v8di){__D, __C, __B, __A, __D, __C, __B, __A}; +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_set4_pd(double __A, double __B, double __C, double __D) { + return __extension__(__m512d){__D, __C, __B, __A, __D, __C, __B, __A}; +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_set4_ps(float __A, float __B, float __C, float __D) { + return __extension__(__m512){__D, __C, __B, __A, __D, __C, __B, __A, + __D, __C, __B, __A, __D, __C, __B, __A}; +} + +#define _mm512_setr4_epi64(e0, e1, e2, e3) _mm512_set4_epi64(e3, e2, e1, e0) + +#define _mm512_setr4_epi32(e0, e1, e2, e3) _mm512_set4_epi32(e3, e2, e1, e0) + +#define _mm512_setr4_pd(e0, e1, e2, e3) _mm512_set4_pd(e3, e2, e1, e0) + +#define _mm512_setr4_ps(e0, e1, e2, e3) _mm512_set4_ps(e3, e2, e1, e0) + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_setzero_ps(void) { + return __extension__(__m512){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_setzero(void) { + 
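+  /* Editorial note: all 512 bits are zero, so the result may be freely
+     reinterpreted as __m512d or __m512i via the cast intrinsics. */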
return _mm512_setzero_ps(); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_setzero_pd(void) { + return __extension__(__m512d){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_setzero_epi32(void) { + return __extension__(__m512i)(__v8di){0, 0, 0, 0, 0, 0, 0, 0}; +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_setzero_si512(void) { + return __extension__(__m512i)(__v8di){0, 0, 0, 0, 0, 0, 0, 0}; +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_mov_pd(__m512d __W, __mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_movapd512_mask((__v8df)__A, (__v8df)__W, + (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mov_pd(__mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_movapd512_mask( + (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_mov_ps(__m512 __W, __mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_movaps512_mask((__v16sf)__A, (__v16sf)__W, + (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mov_ps(__mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_movaps512_mask( + (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_load_pd(void const *__P) { + return *(__m512d *)__P; +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_load_pd(__m512d __W, __mmask8 __U, void const *__P) { + return (__m512d)__builtin_ia32_loadapd512_mask((const __v8df *)__P, + (__v8df)__W, (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_load_pd(__mmask8 __U, void const *__P) { + return (__m512d)__builtin_ia32_loadapd512_mask( + (const __v8df *)__P, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_store_pd(void *__P, __m512d __A) { + *(__m512d *)__P = __A; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A) { + __builtin_ia32_storeapd512_mask((__v8df *)__P, (__v8df)__A, (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_load_ps(void const *__P) { + return *(__m512 *)__P; +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_load_ps(__m512 __W, __mmask16 __U, void const *__P) { + return (__m512)__builtin_ia32_loadaps512_mask((const __v16sf *)__P, + (__v16sf)__W, (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_load_ps(__mmask16 __U, void const *__P) { + return (__m512)__builtin_ia32_loadaps512_mask( + (const __v16sf *)__P, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
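+    /* Editorial note: the plain load/store forms in this header compile to
+       aligned 64-byte vector moves, so __P must be 64-byte aligned; GCC
+       provides separate loadu/storeu forms for unaligned access. */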
_mm512_store_ps(void *__P, __m512 __A) { + *(__m512 *)__P = __A; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A) { + __builtin_ia32_storeaps512_mask((__v16sf *)__P, (__v16sf)__A, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_mov_epi64(__m512i __W, __mmask8 __U, __m512i __A) { + return (__m512i)__builtin_ia32_movdqa64_512_mask((__v8di)__A, (__v8di)__W, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mov_epi64(__mmask8 __U, __m512i __A) { + return (__m512i)__builtin_ia32_movdqa64_512_mask( + (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_load_epi64(void const *__P) { + return *(__m512i *)__P; +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_load_epi64(__m512i __W, __mmask8 __U, void const *__P) { + return (__m512i)__builtin_ia32_movdqa64load512_mask( + (const __v8di *)__P, (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_load_epi64(__mmask8 __U, void const *__P) { + return (__m512i)__builtin_ia32_movdqa64load512_mask( + (const __v8di *)__P, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_store_epi64(void *__P, __m512i __A) { + *(__m512i *)__P = __A; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_store_epi64(void *__P, __mmask8 __U, __m512i __A) { + __builtin_ia32_movdqa64store512_mask((__v8di *)__P, (__v8di)__A, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_mov_epi32(__m512i __W, __mmask16 __U, __m512i __A) { + return (__m512i)__builtin_ia32_movdqa32_512_mask((__v16si)__A, (__v16si)__W, + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mov_epi32(__mmask16 __U, __m512i __A) { + return (__m512i)__builtin_ia32_movdqa32_512_mask( + (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_load_si512(void const *__P) { + return *(__m512i *)__P; +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_load_epi32(void const *__P) { + return *(__m512i *)__P; +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_load_epi32(__m512i __W, __mmask16 __U, void const *__P) { + return (__m512i)__builtin_ia32_movdqa32load512_mask( + (const __v16si *)__P, (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_load_epi32(__mmask16 __U, void const *__P) { + return (__m512i)__builtin_ia32_movdqa32load512_mask( + (const __v16si *)__P, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_store_si512(void *__P, __m512i __A) { + 
*(__m512i *)__P = __A; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_store_epi32(void *__P, __m512i __A) { + *(__m512i *)__P = __A; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_store_epi32(void *__P, __mmask16 __U, __m512i __A) { + __builtin_ia32_movdqa32store512_mask((__v16si *)__P, (__v16si)__A, + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mullo_epi32(__m512i __A, __m512i __B) { + return (__m512i)((__v16su)__A * (__v16su)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmulld512_mask( + (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), __M); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmulld512_mask((__v16si)__A, (__v16si)__B, + (__v16si)__W, __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mullox_epi64(__m512i __A, __m512i __B) { + return (__m512i)((__v8du)__A * (__v8du)__B); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_mullox_epi64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { + return _mm512_mask_mov_epi64(__W, __M, _mm512_mullox_epi64(__A, __B)); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sllv_epi32(__m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psllv16si_mask( + (__v16si)__X, (__v16si)__Y, (__v16si)_mm512_undefined_epi32(), + (__mmask16)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psllv16si_mask((__v16si)__X, (__v16si)__Y, + (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psllv16si_mask((__v16si)__X, (__v16si)__Y, + (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_srav_epi32(__m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psrav16si_mask( + (__v16si)__X, (__v16si)__Y, (__v16si)_mm512_undefined_epi32(), + (__mmask16)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psrav16si_mask((__v16si)__X, (__v16si)__Y, + (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psrav16si_mask((__v16si)__X, (__v16si)__Y, + (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_srlv_epi32(__m512i __X, __m512i __Y) { + return 
(__m512i)__builtin_ia32_psrlv16si_mask( + (__v16si)__X, (__v16si)__Y, (__v16si)_mm512_undefined_epi32(), + (__mmask16)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psrlv16si_mask((__v16si)__X, (__v16si)__Y, + (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psrlv16si_mask((__v16si)__X, (__v16si)__Y, + (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_add_epi64(__m512i __A, __m512i __B) { + return (__m512i)((__v8du)__A + (__v8du)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddq512_mask((__v8di)__A, (__v8di)__B, + (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddq512_mask( + (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sub_epi64(__m512i __A, __m512i __B) { + return (__m512i)((__v8du)__A - (__v8du)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubq512_mask((__v8di)__A, (__v8di)__B, + (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubq512_mask( + (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sllv_epi64(__m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psllv8di_mask( + (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psllv8di_mask((__v8di)__X, (__v8di)__Y, + (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psllv8di_mask( + (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_srav_epi64(__m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psrav8di_mask( + (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { + return 
(__m512i)__builtin_ia32_psrav8di_mask((__v8di)__X, (__v8di)__Y, + (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psrav8di_mask( + (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_srlv_epi64(__m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psrlv8di_mask( + (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psrlv8di_mask((__v8di)__X, (__v8di)__Y, + (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_psrlv8di_mask( + (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_add_epi32(__m512i __A, __m512i __B) { + return (__m512i)((__v16su)__A + (__v16su)__B); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddd512_mask((__v16si)__A, (__v16si)__B, + (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_add_epi32(__mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_paddd512_mask((__v16si)__A, (__v16si)__B, + (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mul_epi32(__m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_pmuldq512_mask( + (__v16si)__X, (__v16si)__Y, (__v8di)_mm512_undefined_epi32(), + (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_pmuldq512_mask((__v16si)__X, (__v16si)__Y, + (__v8di)__W, __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_pmuldq512_mask( + (__v16si)__X, (__v16si)__Y, (__v8di)_mm512_setzero_si512(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sub_epi32(__m512i __A, __m512i __B) { + return (__m512i)((__v16su)__A - (__v16su)__B); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubd512_mask((__v16si)__A, (__v16si)__B, + (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_psubd512_mask((__v16si)__A, 
(__v16si)__B, + (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mul_epu32(__m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_pmuludq512_mask( + (__v16si)__X, (__v16si)__Y, (__v8di)_mm512_undefined_epi32(), + (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_pmuludq512_mask((__v16si)__X, (__v16si)__Y, + (__v8di)__W, __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_pmuludq512_mask( + (__v16si)__X, (__v16si)__Y, (__v8di)_mm512_setzero_si512(), __M); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_slli_epi64(__m512i __A, unsigned int __B) { + return (__m512i)__builtin_ia32_psllqi512_mask( + (__v8di)__A, __B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) { + return (__m512i)__builtin_ia32_psllqi512_mask((__v8di)__A, __B, (__v8di)__W, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B) { + return (__m512i)__builtin_ia32_psllqi512_mask( + (__v8di)__A, __B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} +#else +#define _mm512_slli_epi64(X, C) \ + ((__m512i)__builtin_ia32_psllqi512_mask( \ + (__v8di)(__m512i)(X), (int)(C), \ + (__v8di)(__m512i)_mm512_undefined_epi32(), (__mmask8)-1)) + +#define _mm512_mask_slli_epi64(W, U, X, C) \ + ((__m512i)__builtin_ia32_psllqi512_mask( \ + (__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) + +#define _mm512_maskz_slli_epi64(U, X, C) \ + ((__m512i)__builtin_ia32_psllqi512_mask( \ + (__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_setzero_si512(), \ + (__mmask8)(U))) +#endif + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sll_epi64(__m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psllq512_mask( + (__v8di)__A, (__v2di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psllq512_mask((__v8di)__A, (__v2di)__B, + (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psllq512_mask( + (__v8di)__A, (__v2di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_srli_epi64(__m512i __A, unsigned int __B) { + return (__m512i)__builtin_ia32_psrlqi512_mask( + (__v8di)__A, __B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
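+    /* Editorial note: the immediate-count shifts are defined twice, as
+       always_inline functions under __OPTIMIZE__ and as macros otherwise,
+       so the count reaches the builtin as a compile-time constant even
+       when inlining is disabled. */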
_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) { + return (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)__A, __B, (__v8di)__W, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, unsigned int __B) { + return (__m512i)__builtin_ia32_psrlqi512_mask( + (__v8di)__A, __B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} +#else +#define _mm512_srli_epi64(X, C) \ + ((__m512i)__builtin_ia32_psrlqi512_mask( \ + (__v8di)(__m512i)(X), (int)(C), \ + (__v8di)(__m512i)_mm512_undefined_epi32(), (__mmask8)-1)) + +#define _mm512_mask_srli_epi64(W, U, X, C) \ + ((__m512i)__builtin_ia32_psrlqi512_mask( \ + (__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) + +#define _mm512_maskz_srli_epi64(U, X, C) \ + ((__m512i)__builtin_ia32_psrlqi512_mask( \ + (__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_setzero_si512(), \ + (__mmask8)(U))) +#endif + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_srl_epi64(__m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psrlq512_mask( + (__v8di)__A, (__v2di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psrlq512_mask((__v8di)__A, (__v2di)__B, + (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psrlq512_mask( + (__v8di)__A, (__v2di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_srai_epi64(__m512i __A, unsigned int __B) { + return (__m512i)__builtin_ia32_psraqi512_mask( + (__v8di)__A, __B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) { + return (__m512i)__builtin_ia32_psraqi512_mask((__v8di)__A, __B, (__v8di)__W, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B) { + return (__m512i)__builtin_ia32_psraqi512_mask( + (__v8di)__A, __B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} +#else +#define _mm512_srai_epi64(X, C) \ + ((__m512i)__builtin_ia32_psraqi512_mask( \ + (__v8di)(__m512i)(X), (int)(C), \ + (__v8di)(__m512i)_mm512_undefined_epi32(), (__mmask8)-1)) + +#define _mm512_mask_srai_epi64(W, U, X, C) \ + ((__m512i)__builtin_ia32_psraqi512_mask( \ + (__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) + +#define _mm512_maskz_srai_epi64(U, X, C) \ + ((__m512i)__builtin_ia32_psraqi512_mask( \ + (__v8di)(__m512i)(X), (int)(C), (__v8di)(__m512i)_mm512_setzero_si512(), \ + (__mmask8)(U))) +#endif + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sra_epi64(__m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psraq512_mask( + (__v8di)__A, (__v2di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline 
__m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psraq512_mask((__v8di)__A, (__v2di)__B, + (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psraq512_mask( + (__v8di)__A, (__v2di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_slli_epi32(__m512i __A, unsigned int __B) { + return (__m512i)__builtin_ia32_pslldi512_mask( + (__v16si)__A, __B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) { + return (__m512i)__builtin_ia32_pslldi512_mask((__v16si)__A, __B, (__v16si)__W, + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { + return (__m512i)__builtin_ia32_pslldi512_mask( + (__v16si)__A, __B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} +#else +#define _mm512_slli_epi32(X, C) \ + ((__m512i)__builtin_ia32_pslldi512_mask( \ + (__v16si)(__m512i)(X), (int)(C), \ + (__v16si)(__m512i)_mm512_undefined_epi32(), (__mmask16)-1)) + +#define _mm512_mask_slli_epi32(W, U, X, C) \ + ((__m512i)__builtin_ia32_pslldi512_mask( \ + (__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) + +#define _mm512_maskz_slli_epi32(U, X, C) \ + ((__m512i)__builtin_ia32_pslldi512_mask( \ + (__v16si)(__m512i)(X), (int)(C), \ + (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)(U))) +#endif + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sll_epi32(__m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_pslld512_mask( + (__v16si)__A, (__v4si)__B, (__v16si)_mm512_undefined_epi32(), + (__mmask16)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_pslld512_mask((__v16si)__A, (__v4si)__B, + (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_pslld512_mask((__v16si)__A, (__v4si)__B, + (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_srli_epi32(__m512i __A, unsigned int __B) { + return (__m512i)__builtin_ia32_psrldi512_mask( + (__v16si)__A, __B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) { + return (__m512i)__builtin_ia32_psrldi512_mask((__v16si)__A, __B, (__v16si)__W, + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_srli_epi32(__mmask16 __U, 
__m512i __A, unsigned int __B) { + return (__m512i)__builtin_ia32_psrldi512_mask( + (__v16si)__A, __B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} +#else +#define _mm512_srli_epi32(X, C) \ + ((__m512i)__builtin_ia32_psrldi512_mask( \ + (__v16si)(__m512i)(X), (int)(C), \ + (__v16si)(__m512i)_mm512_undefined_epi32(), (__mmask16)-1)) + +#define _mm512_mask_srli_epi32(W, U, X, C) \ + ((__m512i)__builtin_ia32_psrldi512_mask( \ + (__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) + +#define _mm512_maskz_srli_epi32(U, X, C) \ + ((__m512i)__builtin_ia32_psrldi512_mask( \ + (__v16si)(__m512i)(X), (int)(C), \ + (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)(U))) +#endif + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_srl_epi32(__m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psrld512_mask( + (__v16si)__A, (__v4si)__B, (__v16si)_mm512_undefined_epi32(), + (__mmask16)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psrld512_mask((__v16si)__A, (__v4si)__B, + (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psrld512_mask((__v16si)__A, (__v4si)__B, + (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_srai_epi32(__m512i __A, unsigned int __B) { + return (__m512i)__builtin_ia32_psradi512_mask( + (__v16si)__A, __B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) { + return (__m512i)__builtin_ia32_psradi512_mask((__v16si)__A, __B, (__v16si)__W, + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { + return (__m512i)__builtin_ia32_psradi512_mask( + (__v16si)__A, __B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} +#else +#define _mm512_srai_epi32(X, C) \ + ((__m512i)__builtin_ia32_psradi512_mask( \ + (__v16si)(__m512i)(X), (int)(C), \ + (__v16si)(__m512i)_mm512_undefined_epi32(), (__mmask16)-1)) + +#define _mm512_mask_srai_epi32(W, U, X, C) \ + ((__m512i)__builtin_ia32_psradi512_mask( \ + (__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) + +#define _mm512_maskz_srai_epi32(U, X, C) \ + ((__m512i)__builtin_ia32_psradi512_mask( \ + (__v16si)(__m512i)(X), (int)(C), \ + (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)(U))) +#endif + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sra_epi32(__m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psrad512_mask( + (__v16si)__A, (__v4si)__B, (__v16si)_mm512_undefined_epi32(), + (__mmask16)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psrad512_mask((__v16si)__A, (__v4si)__B, + (__v16si)__W, (__mmask16)__U); +} + 
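+/* Usage sketch (editorial, not part of the upstream header). Each operation
+   comes in three spellings that differ only in how masked-off lanes are
+   filled:
+     _mm512_op(a, b)            all lanes computed
+     _mm512_mask_op(w, m, a, b) lane i = (m >> i) & 1 ? op : w[i]  (merge)
+     _mm512_maskz_op(m, a, b)   lane i = (m >> i) & 1 ? op : 0     (zero)
+   For example, assuming an AVX-512F target:
+     __m512i a = _mm512_set1_epi32(1), b = _mm512_set1_epi32(2);
+     __m512i m = _mm512_mask_add_epi32(a, 0x00FF, a, b);  // lanes 0-7: 3, 8-15: 1
+     __m512i z = _mm512_maskz_add_epi32(0x00FF, a, b);    // lanes 0-7: 3, 8-15: 0
+*/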
+extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) { + return (__m512i)__builtin_ia32_psrad512_mask((__v16si)__A, (__v4si)__B, + (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_round_sd(__m128d __A, __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_addsd_round((__v2df)__A, (__v2df)__B, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_add_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) { + return (__m128d)__builtin_ia32_addsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_add_round_sd(__mmask8 __U, __m128d __A, __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_addsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_round_ss(__m128 __A, __m128 __B, const int __R) { + return (__m128)__builtin_ia32_addss_round((__v4sf)__A, (__v4sf)__B, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_add_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) { + return (__m128)__builtin_ia32_addss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_add_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { + return (__m128)__builtin_ia32_addss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_round_sd(__m128d __A, __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_subsd_round((__v2df)__A, (__v2df)__B, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_sub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) { + return (__m128d)__builtin_ia32_subsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_sub_round_sd(__mmask8 __U, __m128d __A, __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_subsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_round_ss(__m128 __A, __m128 __B, const int __R) { + return (__m128)__builtin_ia32_subss_round((__v4sf)__A, (__v4sf)__B, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_sub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) { + return (__m128)__builtin_ia32_subss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_sub_round_ss(__mmask8 __U, __m128 __A, __m128 __B, 
const int __R) { + return (__m128)__builtin_ia32_subss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); +} + +#else +#define _mm_add_round_sd(A, B, C) (__m128d) __builtin_ia32_addsd_round(A, B, C) + +#define _mm_mask_add_round_sd(W, U, A, B, C) \ + (__m128d) __builtin_ia32_addsd_mask_round(A, B, W, U, C) + +#define _mm_maskz_add_round_sd(U, A, B, C) \ + (__m128d) \ + __builtin_ia32_addsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) + +#define _mm_add_round_ss(A, B, C) (__m128) __builtin_ia32_addss_round(A, B, C) + +#define _mm_mask_add_round_ss(W, U, A, B, C) \ + (__m128) __builtin_ia32_addss_mask_round(A, B, W, U, C) + +#define _mm_maskz_add_round_ss(U, A, B, C) \ + (__m128) __builtin_ia32_addss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) + +#define _mm_sub_round_sd(A, B, C) (__m128d) __builtin_ia32_subsd_round(A, B, C) + +#define _mm_mask_sub_round_sd(W, U, A, B, C) \ + (__m128d) __builtin_ia32_subsd_mask_round(A, B, W, U, C) + +#define _mm_maskz_sub_round_sd(U, A, B, C) \ + (__m128d) \ + __builtin_ia32_subsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) + +#define _mm_sub_round_ss(A, B, C) (__m128) __builtin_ia32_subss_round(A, B, C) + +#define _mm_mask_sub_round_ss(W, U, A, B, C) \ + (__m128) __builtin_ia32_subss_mask_round(A, B, W, U, C) + +#define _mm_maskz_sub_round_ss(U, A, B, C) \ + (__m128) __builtin_ia32_subss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) + +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_ternarylogic_epi64(__m512i __A, __m512i __B, __m512i __C, + const int __imm) { + return (__m512i)__builtin_ia32_pternlogq512_mask( + (__v8di)__A, (__v8di)__B, (__v8di)__C, __imm, (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_ternarylogic_epi64(__m512i __A, __mmask8 __U, __m512i __B, + __m512i __C, const int __imm) { + return (__m512i)__builtin_ia32_pternlogq512_mask( + (__v8di)__A, (__v8di)__B, (__v8di)__C, __imm, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_ternarylogic_epi64(__mmask8 __U, __m512i __A, __m512i __B, + __m512i __C, const int __imm) { + return (__m512i)__builtin_ia32_pternlogq512_maskz( + (__v8di)__A, (__v8di)__B, (__v8di)__C, __imm, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_ternarylogic_epi32(__m512i __A, __m512i __B, __m512i __C, + const int __imm) { + return (__m512i)__builtin_ia32_pternlogd512_mask( + (__v16si)__A, (__v16si)__B, (__v16si)__C, __imm, (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_ternarylogic_epi32(__m512i __A, __mmask16 __U, __m512i __B, + __m512i __C, const int __imm) { + return (__m512i)__builtin_ia32_pternlogd512_mask( + (__v16si)__A, (__v16si)__B, (__v16si)__C, __imm, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_ternarylogic_epi32(__mmask16 __U, __m512i __A, __m512i __B, + __m512i __C, const int __imm) { + return (__m512i)__builtin_ia32_pternlogd512_maskz( + (__v16si)__A, (__v16si)__B, (__v16si)__C, __imm, (__mmask16)__U); +} +#else +#define _mm512_ternarylogic_epi64(A, B, C, I) \ + ((__m512i)__builtin_ia32_pternlogq512_mask( \ + (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), 
\ + (int)(I), (__mmask8)-1)) +#define _mm512_mask_ternarylogic_epi64(A, U, B, C, I) \ + ((__m512i)__builtin_ia32_pternlogq512_mask( \ + (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \ + (int)(I), (__mmask8)(U))) +#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, I) \ + ((__m512i)__builtin_ia32_pternlogq512_maskz( \ + (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \ + (int)(I), (__mmask8)(U))) +#define _mm512_ternarylogic_epi32(A, B, C, I) \ + ((__m512i)__builtin_ia32_pternlogd512_mask( \ + (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \ + (int)(I), (__mmask16)-1)) +#define _mm512_mask_ternarylogic_epi32(A, U, B, C, I) \ + ((__m512i)__builtin_ia32_pternlogd512_mask( \ + (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \ + (int)(I), (__mmask16)(U))) +#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, I) \ + ((__m512i)__builtin_ia32_pternlogd512_maskz( \ + (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \ + (int)(I), (__mmask16)(U))) +#endif + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_rcp14_pd(__m512d __A) { + return (__m512d)__builtin_ia32_rcp14pd512_mask( + (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_rcp14_pd(__m512d __W, __mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_rcp14pd512_mask((__v8df)__A, (__v8df)__W, + (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_rcp14_pd(__mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_rcp14pd512_mask( + (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_rcp14_ps(__m512 __A) { + return (__m512)__builtin_ia32_rcp14ps512_mask( + (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_rcp14_ps(__m512 __W, __mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_rcp14ps512_mask((__v16sf)__A, (__v16sf)__W, + (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_rcp14_ps(__mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_rcp14ps512_mask( + (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rcp14_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_rcp14sd((__v2df)__B, (__v2df)__A); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_rcp14_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_rcp14sd_mask((__v2df)__B, (__v2df)__A, + (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_rcp14_sd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_rcp14sd_mask( + (__v2df)__B, (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rcp14_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_rcp14ss((__v4sf)__B, (__v4sf)__A);
+} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_rcp14_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_rcp14ss_mask((__v4sf)__B, (__v4sf)__A, + (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_rcp14_ss(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_rcp14ss_mask( + (__v4sf)__B, (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_rsqrt14_pd(__m512d __A) { + return (__m512d)__builtin_ia32_rsqrt14pd512_mask( + (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_rsqrt14_pd(__m512d __W, __mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_rsqrt14pd512_mask((__v8df)__A, (__v8df)__W, + (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_rsqrt14_pd(__mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_rsqrt14pd512_mask( + (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_rsqrt14_ps(__m512 __A) { + return (__m512)__builtin_ia32_rsqrt14ps512_mask( + (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_rsqrt14_ps(__m512 __W, __mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_rsqrt14ps512_mask((__v16sf)__A, (__v16sf)__W, + (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_rsqrt14_ps(__mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_rsqrt14ps512_mask( + (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rsqrt14_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_rsqrt14sd((__v2df)__B, (__v2df)__A); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_rsqrt14_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_rsqrt14sd_mask((__v2df)__B, (__v2df)__A, + (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_rsqrt14_sd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_rsqrt14sd_mask( + (__v2df)__B, (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rsqrt14_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_rsqrt14ss((__v4sf)__B, (__v4sf)__A); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_rsqrt14_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_rsqrt14ss_mask((__v4sf)__B, (__v4sf)__A, + (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_rsqrt14_ss(__mmask8 __U, __m128 __A, __m128 __B) { + return 
(__m128)__builtin_ia32_rsqrt14ss_mask( + (__v4sf)__B, (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sqrt_round_pd(__m512d __A, const int __R) { + return (__m512d)__builtin_ia32_sqrtpd512_mask( + (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_sqrt_round_pd(__m512d __W, __mmask8 __U, __m512d __A, + const int __R) { + return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__A, (__v8df)__W, + (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sqrt_round_pd(__mmask8 __U, __m512d __A, const int __R) { + return (__m512d)__builtin_ia32_sqrtpd512_mask( + (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sqrt_round_ps(__m512 __A, const int __R) { + return (__m512)__builtin_ia32_sqrtps512_mask( + (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_sqrt_round_ps(__m512 __W, __mmask16 __U, __m512 __A, + const int __R) { + return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A, (__v16sf)__W, + (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sqrt_round_ps(__mmask16 __U, __m512 __A, const int __R) { + return (__m512)__builtin_ia32_sqrtps512_mask( + (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sqrt_round_sd(__m128d __A, __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_sqrtsd_mask_round( + (__v2df)__B, (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_sqrt_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) { + return (__m128d)__builtin_ia32_sqrtsd_mask_round( + (__v2df)__B, (__v2df)__A, (__v2df)__W, (__mmask8)__U, __R); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_sqrt_round_sd(__mmask8 __U, __m128d __A, __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_sqrtsd_mask_round( + (__v2df)__B, (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sqrt_round_ss(__m128 __A, __m128 __B, const int __R) { + return (__m128)__builtin_ia32_sqrtss_mask_round( + (__v4sf)__B, (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_sqrt_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) { + return (__m128)__builtin_ia32_sqrtss_mask_round( + (__v4sf)__B, (__v4sf)__A, (__v4sf)__W, (__mmask8)__U, __R); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_sqrt_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { + return (__m128)__builtin_ia32_sqrtss_mask_round( + (__v4sf)__B, (__v4sf)__A, 
(__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); +} +#else +#define _mm512_sqrt_round_pd(A, C) \ + (__m512d) \ + __builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, C) + +#define _mm512_mask_sqrt_round_pd(W, U, A, C) \ + (__m512d) __builtin_ia32_sqrtpd512_mask(A, W, U, C) + +#define _mm512_maskz_sqrt_round_pd(U, A, C) \ + (__m512d) __builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_sqrt_round_ps(A, C) \ + (__m512) \ + __builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_undefined_ps(), -1, C) + +#define _mm512_mask_sqrt_round_ps(W, U, A, C) \ + (__m512) __builtin_ia32_sqrtps512_mask(A, W, U, C) + +#define _mm512_maskz_sqrt_round_ps(U, A, C) \ + (__m512) __builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm_sqrt_round_sd(A, B, C) \ + (__m128d) \ + __builtin_ia32_sqrtsd_mask_round(B, A, (__v2df)_mm_setzero_pd(), -1, C) + +#define _mm_mask_sqrt_round_sd(W, U, A, B, C) \ + (__m128d) __builtin_ia32_sqrtsd_mask_round(B, A, W, U, C) + +#define _mm_maskz_sqrt_round_sd(U, A, B, C) \ + (__m128d) \ + __builtin_ia32_sqrtsd_mask_round(B, A, (__v2df)_mm_setzero_pd(), U, C) + +#define _mm_sqrt_round_ss(A, B, C) \ + (__m128) \ + __builtin_ia32_sqrtss_mask_round(B, A, (__v4sf)_mm_setzero_ps(), -1, C) + +#define _mm_mask_sqrt_round_ss(W, U, A, B, C) \ + (__m128) __builtin_ia32_sqrtss_mask_round(B, A, W, U, C) + +#define _mm_maskz_sqrt_round_ss(U, A, B, C) \ + (__m128) \ + __builtin_ia32_sqrtss_mask_round(B, A, (__v4sf)_mm_setzero_ps(), U, C) +#endif + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi8_epi32(__m128i __A) { + return (__m512i)__builtin_ia32_pmovsxbd512_mask( + (__v16qi)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A) { + return (__m512i)__builtin_ia32_pmovsxbd512_mask((__v16qi)__A, (__v16si)__W, + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) { + return (__m512i)__builtin_ia32_pmovsxbd512_mask( + (__v16qi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi8_epi64(__m128i __A) { + return (__m512i)__builtin_ia32_pmovsxbq512_mask( + (__v16qi)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A) { + return (__m512i)__builtin_ia32_pmovsxbq512_mask((__v16qi)__A, (__v8di)__W, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { + return (__m512i)__builtin_ia32_pmovsxbq512_mask( + (__v16qi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi16_epi32(__m256i __A) { + return (__m512i)__builtin_ia32_pmovsxwd512_mask( + (__v16hi)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A) { + return 
(__m512i)__builtin_ia32_pmovsxwd512_mask((__v16hi)__A, (__v16si)__W, + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) { + return (__m512i)__builtin_ia32_pmovsxwd512_mask( + (__v16hi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi16_epi64(__m128i __A) { + return (__m512i)__builtin_ia32_pmovsxwq512_mask( + (__v8hi)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A) { + return (__m512i)__builtin_ia32_pmovsxwq512_mask((__v8hi)__A, (__v8di)__W, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { + return (__m512i)__builtin_ia32_pmovsxwq512_mask( + (__v8hi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi32_epi64(__m256i __X) { + return (__m512i)__builtin_ia32_pmovsxdq512_mask( + (__v8si)__X, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X) { + return (__m512i)__builtin_ia32_pmovsxdq512_mask((__v8si)__X, (__v8di)__W, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) { + return (__m512i)__builtin_ia32_pmovsxdq512_mask( + (__v8si)__X, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepu8_epi32(__m128i __A) { + return (__m512i)__builtin_ia32_pmovzxbd512_mask( + (__v16qi)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A) { + return (__m512i)__builtin_ia32_pmovzxbd512_mask((__v16qi)__A, (__v16si)__W, + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) { + return (__m512i)__builtin_ia32_pmovzxbd512_mask( + (__v16qi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepu8_epi64(__m128i __A) { + return (__m512i)__builtin_ia32_pmovzxbq512_mask( + (__v16qi)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A) { + return (__m512i)__builtin_ia32_pmovzxbq512_mask((__v16qi)__A, (__v8di)__W, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { + return (__m512i)__builtin_ia32_pmovzxbq512_mask( + (__v16qi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline 
__m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepu16_epi32(__m256i __A) { + return (__m512i)__builtin_ia32_pmovzxwd512_mask( + (__v16hi)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A) { + return (__m512i)__builtin_ia32_pmovzxwd512_mask((__v16hi)__A, (__v16si)__W, + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) { + return (__m512i)__builtin_ia32_pmovzxwd512_mask( + (__v16hi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepu16_epi64(__m128i __A) { + return (__m512i)__builtin_ia32_pmovzxwq512_mask( + (__v8hi)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A) { + return (__m512i)__builtin_ia32_pmovzxwq512_mask((__v8hi)__A, (__v8di)__W, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { + return (__m512i)__builtin_ia32_pmovzxwq512_mask( + (__v8hi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepu32_epi64(__m256i __X) { + return (__m512i)__builtin_ia32_pmovzxdq512_mask( + (__v8si)__X, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X) { + return (__m512i)__builtin_ia32_pmovzxdq512_mask((__v8si)__X, (__v8di)__W, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) { + return (__m512i)__builtin_ia32_pmovzxdq512_mask( + (__v8si)__X, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_add_round_pd(__m512d __A, __m512d __B, const int __R) { + return (__m512d)__builtin_ia32_addpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)_mm512_undefined_pd(), + (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_add_round_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { + return (__m512d)__builtin_ia32_addpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__W, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_add_round_pd(__mmask8 __U, __m512d __A, __m512d __B, + const int __R) { + return (__m512d)__builtin_ia32_addpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)_mm512_setzero_pd(), + (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_add_round_ps(__m512 __A, __m512 __B, const int __R) { + return (__m512)__builtin_ia32_addps512_mask((__v16sf)__A, (__v16sf)__B, + 
(__v16sf)_mm512_undefined_ps(), + (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_add_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, + const int __R) { + return (__m512)__builtin_ia32_addps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_add_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + const int __R) { + return (__m512)__builtin_ia32_addps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)_mm512_setzero_ps(), + (__mmask16)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sub_round_pd(__m512d __A, __m512d __B, const int __R) { + return (__m512d)__builtin_ia32_subpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)_mm512_undefined_pd(), + (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_sub_round_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { + return (__m512d)__builtin_ia32_subpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__W, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, + const int __R) { + return (__m512d)__builtin_ia32_subpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)_mm512_setzero_pd(), + (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sub_round_ps(__m512 __A, __m512 __B, const int __R) { + return (__m512)__builtin_ia32_subps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)_mm512_undefined_ps(), + (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_sub_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, + const int __R) { + return (__m512)__builtin_ia32_subps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + const int __R) { + return (__m512)__builtin_ia32_subps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)_mm512_setzero_ps(), + (__mmask16)__U, __R); +} +#else +#define _mm512_add_round_pd(A, B, C) \ + (__m512d) \ + __builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) + +#define _mm512_mask_add_round_pd(W, U, A, B, C) \ + (__m512d) __builtin_ia32_addpd512_mask(A, B, W, U, C) + +#define _mm512_maskz_add_round_pd(U, A, B, C) \ + (__m512d) \ + __builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_add_round_ps(A, B, C) \ + (__m512) __builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), \ + -1, C) + +#define _mm512_mask_add_round_ps(W, U, A, B, C) \ + (__m512) __builtin_ia32_addps512_mask(A, B, W, U, C) + +#define _mm512_maskz_add_round_ps(U, A, B, C) \ + (__m512) \ + __builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm512_sub_round_pd(A, B, C) \ + (__m512d) \ + __builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) + +#define _mm512_mask_sub_round_pd(W, U, A, B, C) \ + (__m512d) __builtin_ia32_subpd512_mask(A, B, W, U, C) + +#define _mm512_maskz_sub_round_pd(U, A, B, C) \ 
+ (__m512d) \ + __builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_sub_round_ps(A, B, C) \ + (__m512) __builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), \ + -1, C) + +#define _mm512_mask_sub_round_ps(W, U, A, B, C) \ + (__m512) __builtin_ia32_subps512_mask(A, B, W, U, C) + +#define _mm512_maskz_sub_round_ps(U, A, B, C) \ + (__m512) \ + __builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mul_round_pd(__m512d __A, __m512d __B, const int __R) { + return (__m512d)__builtin_ia32_mulpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)_mm512_undefined_pd(), + (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_mul_round_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { + return (__m512d)__builtin_ia32_mulpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__W, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mul_round_pd(__mmask8 __U, __m512d __A, __m512d __B, + const int __R) { + return (__m512d)__builtin_ia32_mulpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)_mm512_setzero_pd(), + (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mul_round_ps(__m512 __A, __m512 __B, const int __R) { + return (__m512)__builtin_ia32_mulps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)_mm512_undefined_ps(), + (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_mul_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, + const int __R) { + return (__m512)__builtin_ia32_mulps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mul_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + const int __R) { + return (__m512)__builtin_ia32_mulps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)_mm512_setzero_ps(), + (__mmask16)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_div_round_pd(__m512d __M, __m512d __V, const int __R) { + return (__m512d)__builtin_ia32_divpd512_mask((__v8df)__M, (__v8df)__V, + (__v8df)_mm512_undefined_pd(), + (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_div_round_pd(__m512d __W, __mmask8 __U, __m512d __M, + __m512d __V, const int __R) { + return (__m512d)__builtin_ia32_divpd512_mask((__v8df)__M, (__v8df)__V, + (__v8df)__W, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_div_round_pd(__mmask8 __U, __m512d __M, __m512d __V, + const int __R) { + return (__m512d)__builtin_ia32_divpd512_mask((__v8df)__M, (__v8df)__V, + (__v8df)_mm512_setzero_pd(), + (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_div_round_ps(__m512 __A, __m512 __B, const int __R) { + return (__m512)__builtin_ia32_divps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)_mm512_undefined_ps(), + (__mmask16)-1, __R); +} + +extern __inline __m512 + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_div_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, + const int __R) { + return (__m512)__builtin_ia32_divps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_div_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + const int __R) { + return (__m512)__builtin_ia32_divps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)_mm512_setzero_ps(), + (__mmask16)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_round_sd(__m128d __A, __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_mulsd_round((__v2df)__A, (__v2df)__B, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mul_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) { + return (__m128d)__builtin_ia32_mulsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_mul_round_sd(__mmask8 __U, __m128d __A, __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_mulsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_round_ss(__m128 __A, __m128 __B, const int __R) { + return (__m128)__builtin_ia32_mulss_round((__v4sf)__A, (__v4sf)__B, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mul_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) { + return (__m128)__builtin_ia32_mulss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_mul_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { + return (__m128)__builtin_ia32_mulss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_div_round_sd(__m128d __A, __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_divsd_round((__v2df)__A, (__v2df)__B, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_div_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) { + return (__m128d)__builtin_ia32_divsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_div_round_sd(__mmask8 __U, __m128d __A, __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_divsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_div_round_ss(__m128 __A, __m128 __B, const int __R) { + return (__m128)__builtin_ia32_divss_round((__v4sf)__A, (__v4sf)__B, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_div_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int 
__R) { + return (__m128)__builtin_ia32_divss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_div_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { + return (__m128)__builtin_ia32_divss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); +} + +#else +#define _mm512_mul_round_pd(A, B, C) \ + (__m512d) \ + __builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) + +#define _mm512_mask_mul_round_pd(W, U, A, B, C) \ + (__m512d) __builtin_ia32_mulpd512_mask(A, B, W, U, C) + +#define _mm512_maskz_mul_round_pd(U, A, B, C) \ + (__m512d) \ + __builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_mul_round_ps(A, B, C) \ + (__m512) __builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), \ + -1, C) + +#define _mm512_mask_mul_round_ps(W, U, A, B, C) \ + (__m512) __builtin_ia32_mulps512_mask(A, B, W, U, C) + +#define _mm512_maskz_mul_round_ps(U, A, B, C) \ + (__m512) \ + __builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm512_div_round_pd(A, B, C) \ + (__m512d) \ + __builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) + +#define _mm512_mask_div_round_pd(W, U, A, B, C) \ + (__m512d) __builtin_ia32_divpd512_mask(A, B, W, U, C) + +#define _mm512_maskz_div_round_pd(U, A, B, C) \ + (__m512d) \ + __builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_div_round_ps(A, B, C) \ + (__m512) __builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), \ + -1, C) + +#define _mm512_mask_div_round_ps(W, U, A, B, C) \ + (__m512) __builtin_ia32_divps512_mask(A, B, W, U, C) + +#define _mm512_maskz_div_round_ps(U, A, B, C) \ + (__m512) \ + __builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm_mul_round_sd(A, B, C) (__m128d) __builtin_ia32_mulsd_round(A, B, C) + +#define _mm_mask_mul_round_sd(W, U, A, B, C) \ + (__m128d) __builtin_ia32_mulsd_mask_round(A, B, W, U, C) + +#define _mm_maskz_mul_round_sd(U, A, B, C) \ + (__m128d) \ + __builtin_ia32_mulsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) + +#define _mm_mul_round_ss(A, B, C) (__m128) __builtin_ia32_mulss_round(A, B, C) + +#define _mm_mask_mul_round_ss(W, U, A, B, C) \ + (__m128) __builtin_ia32_mulss_mask_round(A, B, W, U, C) + +#define _mm_maskz_mul_round_ss(U, A, B, C) \ + (__m128) __builtin_ia32_mulss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) + +#define _mm_div_round_sd(A, B, C) (__m128d) __builtin_ia32_divsd_round(A, B, C) + +#define _mm_mask_div_round_sd(W, U, A, B, C) \ + (__m128d) __builtin_ia32_divsd_mask_round(A, B, W, U, C) + +#define _mm_maskz_div_round_sd(U, A, B, C) \ + (__m128d) \ + __builtin_ia32_divsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) + +#define _mm_div_round_ss(A, B, C) (__m128) __builtin_ia32_divss_round(A, B, C) + +#define _mm_mask_div_round_ss(W, U, A, B, C) \ + (__m128) __builtin_ia32_divss_mask_round(A, B, W, U, C) + +#define _mm_maskz_div_round_ss(U, A, B, C) \ + (__m128) __builtin_ia32_divss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) + +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_max_round_pd(__m512d __A, __m512d __B, const int __R) { + return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)_mm512_undefined_pd(), + (__mmask8)-1, __R); +} + 
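/*
 * Illustrative usage sketch (not part of the imported header): the
 * *_round_* intrinsics defined above take a rounding-mode immediate
 * __R that must be a compile-time constant, which is why every group
 * also ships macro fallbacks for builds without __OPTIMIZE__. The
 * mask forms merge against a destination vector; the maskz forms zero
 * the deselected lanes. Assumes an AVX-512F target (-mavx512f) and the
 * usual <immintrin.h> entry point.
 */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512d a = _mm512_set1_pd(1.5);
  __m512d b = _mm512_set1_pd(2.25);
  /* round to nearest with exceptions suppressed (SAE) */
  __m512d sum = _mm512_add_round_pd(
      a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  /* maskz variant: lanes 0..3 are computed, lanes 4..7 are zeroed */
  __m512d low = _mm512_maskz_add_round_pd(
      (__mmask8)0x0f, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  double out[8];
  _mm512_storeu_pd(out, sum);
  printf("%g\n", out[0]); /* prints 3.75 */
  _mm512_storeu_pd(out, low);
  printf("%g %g\n", out[0], out[7]); /* prints 3.75 0 */
  return 0;
}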
+extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_max_round_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { + return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__W, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_max_round_pd(__mmask8 __U, __m512d __A, __m512d __B, + const int __R) { + return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)_mm512_setzero_pd(), + (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_max_round_ps(__m512 __A, __m512 __B, const int __R) { + return (__m512)__builtin_ia32_maxps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)_mm512_undefined_ps(), + (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_max_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, + const int __R) { + return (__m512)__builtin_ia32_maxps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_max_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + const int __R) { + return (__m512)__builtin_ia32_maxps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)_mm512_setzero_ps(), + (__mmask16)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_min_round_pd(__m512d __A, __m512d __B, const int __R) { + return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)_mm512_undefined_pd(), + (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_min_round_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { + return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__W, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_min_round_pd(__mmask8 __U, __m512d __A, __m512d __B, + const int __R) { + return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)_mm512_setzero_pd(), + (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_min_round_ps(__m512 __A, __m512 __B, const int __R) { + return (__m512)__builtin_ia32_minps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)_mm512_undefined_ps(), + (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_min_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, + const int __R) { + return (__m512)__builtin_ia32_minps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_min_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + const int __R) { + return (__m512)__builtin_ia32_minps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)_mm512_setzero_ps(), + (__mmask16)__U, __R); +} +#else +#define _mm512_max_round_pd(A, B, R) \ + (__m512d) \ + __builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R) + +#define _mm512_mask_max_round_pd(W, U, A, B, R) \ + 
(__m512d) __builtin_ia32_maxpd512_mask(A, B, W, U, R) + +#define _mm512_maskz_max_round_pd(U, A, B, R) \ + (__m512d) \ + __builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R) + +#define _mm512_max_round_ps(A, B, R) \ + (__m512) __builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), \ + -1, R) + +#define _mm512_mask_max_round_ps(W, U, A, B, R) \ + (__m512) __builtin_ia32_maxps512_mask(A, B, W, U, R) + +#define _mm512_maskz_max_round_ps(U, A, B, R) \ + (__m512) \ + __builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R) + +#define _mm512_min_round_pd(A, B, R) \ + (__m512d) \ + __builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R) + +#define _mm512_mask_min_round_pd(W, U, A, B, R) \ + (__m512d) __builtin_ia32_minpd512_mask(A, B, W, U, R) + +#define _mm512_maskz_min_round_pd(U, A, B, R) \ + (__m512d) \ + __builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R) + +#define _mm512_min_round_ps(A, B, R) \ + (__m512) __builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), \ + -1, R) + +#define _mm512_mask_min_round_ps(W, U, A, B, R) \ + (__m512) __builtin_ia32_minps512_mask(A, B, W, U, R) + +#define _mm512_maskz_min_round_ps(U, A, B, R) \ + (__m512) \ + __builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_scalef_round_pd(__m512d __A, __m512d __B, const int __R) { + return (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)_mm512_undefined_pd(), + (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_scalef_round_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { + return (__m512d)__builtin_ia32_scalefpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_scalef_round_pd(__mmask8 __U, __m512d __A, __m512d __B, + const int __R) { + return (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)_mm512_setzero_pd(), + (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_scalef_round_ps(__m512 __A, __m512 __B, const int __R) { + return (__m512)__builtin_ia32_scalefps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)_mm512_undefined_ps(), + (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_scalef_round_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) { + return (__m512)__builtin_ia32_scalefps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_scalef_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + const int __R) { + return (__m512)__builtin_ia32_scalefps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)_mm512_setzero_ps(), + (__mmask16)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_scalef_round_sd(__m128d __A, __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_scalefsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)-1, __R); +} + +extern __inline __m128d + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_scalef_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_scalefsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_scalef_round_sd(__mmask8 __U, __m128d __A, __m128d __B, + const int __R) { + return (__m128d)__builtin_ia32_scalefsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_scalef_round_ss(__m128 __A, __m128 __B, const int __R) { + return (__m128)__builtin_ia32_scalefss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_scalef_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) { + return (__m128)__builtin_ia32_scalefss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_scalef_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { + return (__m128)__builtin_ia32_scalefss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); +} +#else +#define _mm512_scalef_round_pd(A, B, C) \ + (__m512d) __builtin_ia32_scalefpd512_mask( \ + A, B, (__v8df)_mm512_undefined_pd(), -1, C) + +#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \ + (__m512d) __builtin_ia32_scalefpd512_mask(A, B, W, U, C) + +#define _mm512_maskz_scalef_round_pd(U, A, B, C) \ + (__m512d) \ + __builtin_ia32_scalefpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_scalef_round_ps(A, B, C) \ + (__m512) __builtin_ia32_scalefps512_mask( \ + A, B, (__v16sf)_mm512_undefined_ps(), -1, C) + +#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \ + (__m512) __builtin_ia32_scalefps512_mask(A, B, W, U, C) + +#define _mm512_maskz_scalef_round_ps(U, A, B, C) \ + (__m512) __builtin_ia32_scalefps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), \ + U, C) + +#define _mm_scalef_round_sd(A, B, C) \ + (__m128d) __builtin_ia32_scalefsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), \ + -1, C) + +#define _mm_scalef_round_ss(A, B, C) \ + (__m128) __builtin_ia32_scalefss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), \ + -1, C) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfmaddpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fmadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfmaddpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) { + return (__m512d)__builtin_ia32_vfmaddpd512_mask3( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512d + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fmadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfmaddpd512_maskz( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfmaddps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fmadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfmaddps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { + return (__m512)__builtin_ia32_vfmaddps512_mask3( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fmadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfmaddps512_maskz( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfmsubpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fmsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfmsubpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) { + return (__m512d)__builtin_ia32_vfmsubpd512_mask3( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fmsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfmsubpd512_maskz( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfmsubps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fmsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfmsubps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { + 
return (__m512)__builtin_ia32_vfmsubps512_mask3( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fmsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfmsubps512_maskz( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfmaddsubpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fmaddsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfmaddsubpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) { + return (__m512d)__builtin_ia32_vfmaddsubpd512_mask3( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fmaddsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfmaddsubpd512_maskz( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfmaddsubps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fmaddsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfmaddsubps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { + return (__m512)__builtin_ia32_vfmaddsubps512_mask3( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fmaddsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfmaddsubps512_maskz( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfmaddsubpd512_mask( + (__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fmsubadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfmaddsubpd512_mask( + (__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)__U, 
__R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) { + return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fmsubadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfmaddsubpd512_maskz( + (__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfmaddsubps512_mask( + (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fmsubadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfmaddsubps512_mask( + (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { + return (__m512)__builtin_ia32_vfmsubaddps512_mask3( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fmsubadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfmaddsubps512_maskz( + (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfnmaddpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fnmadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfnmaddpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) { + return (__m512d)__builtin_ia32_vfnmaddpd512_mask3( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fnmadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfnmaddpd512_maskz( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfnmaddps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm512_mask_fnmadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfnmaddps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { + return (__m512)__builtin_ia32_vfnmaddps512_mask3( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fnmadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfnmaddps512_maskz( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfnmsubpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fnmsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfnmsubpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) { + return (__m512d)__builtin_ia32_vfnmsubpd512_mask3( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fnmsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) { + return (__m512d)__builtin_ia32_vfnmsubpd512_maskz( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfnmsubps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fnmsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfnmsubps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { + return (__m512)__builtin_ia32_vfnmsubps512_mask3( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fnmsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) { + return (__m512)__builtin_ia32_vfnmsubps512_maskz( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); +} +#else +#define _mm512_fmadd_round_pd(A, B, C, R) \ + (__m512d) __builtin_ia32_vfmaddpd512_mask(A, B, C, -1, R) + +#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \ + (__m512d) __builtin_ia32_vfmaddpd512_mask(A, B, C, 
U, R) + +#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \ + (__m512d) __builtin_ia32_vfmaddpd512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \ + (__m512d) __builtin_ia32_vfmaddpd512_maskz(A, B, C, U, R) + +#define _mm512_fmadd_round_ps(A, B, C, R) \ + (__m512) __builtin_ia32_vfmaddps512_mask(A, B, C, -1, R) + +#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \ + (__m512) __builtin_ia32_vfmaddps512_mask(A, B, C, U, R) + +#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \ + (__m512) __builtin_ia32_vfmaddps512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \ + (__m512) __builtin_ia32_vfmaddps512_maskz(A, B, C, U, R) + +#define _mm512_fmsub_round_pd(A, B, C, R) \ + (__m512d) __builtin_ia32_vfmsubpd512_mask(A, B, C, -1, R) + +#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \ + (__m512d) __builtin_ia32_vfmsubpd512_mask(A, B, C, U, R) + +#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \ + (__m512d) __builtin_ia32_vfmsubpd512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \ + (__m512d) __builtin_ia32_vfmsubpd512_maskz(A, B, C, U, R) + +#define _mm512_fmsub_round_ps(A, B, C, R) \ + (__m512) __builtin_ia32_vfmsubps512_mask(A, B, C, -1, R) + +#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \ + (__m512) __builtin_ia32_vfmsubps512_mask(A, B, C, U, R) + +#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \ + (__m512) __builtin_ia32_vfmsubps512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \ + (__m512) __builtin_ia32_vfmsubps512_maskz(A, B, C, U, R) + +#define _mm512_fmaddsub_round_pd(A, B, C, R) \ + (__m512d) __builtin_ia32_vfmaddsubpd512_mask(A, B, C, -1, R) + +#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \ + (__m512d) __builtin_ia32_vfmaddsubpd512_mask(A, B, C, U, R) + +#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \ + (__m512d) __builtin_ia32_vfmaddsubpd512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \ + (__m512d) __builtin_ia32_vfmaddsubpd512_maskz(A, B, C, U, R) + +#define _mm512_fmaddsub_round_ps(A, B, C, R) \ + (__m512) __builtin_ia32_vfmaddsubps512_mask(A, B, C, -1, R) + +#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \ + (__m512) __builtin_ia32_vfmaddsubps512_mask(A, B, C, U, R) + +#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \ + (__m512) __builtin_ia32_vfmaddsubps512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \ + (__m512) __builtin_ia32_vfmaddsubps512_maskz(A, B, C, U, R) + +#define _mm512_fmsubadd_round_pd(A, B, C, R) \ + (__m512d) __builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), -1, R) + +#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \ + (__m512d) __builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), U, R) + +#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \ + (__m512d) __builtin_ia32_vfmsubaddpd512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \ + (__m512d) __builtin_ia32_vfmaddsubpd512_maskz(A, B, -(C), U, R) + +#define _mm512_fmsubadd_round_ps(A, B, C, R) \ + (__m512) __builtin_ia32_vfmaddsubps512_mask(A, B, -(C), -1, R) + +#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \ + (__m512) __builtin_ia32_vfmaddsubps512_mask(A, B, -(C), U, R) + +#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \ + (__m512) __builtin_ia32_vfmsubaddps512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \ + (__m512) __builtin_ia32_vfmaddsubps512_maskz(A, B, -(C), U, R) + +#define _mm512_fnmadd_round_pd(A, B, C, R) \ 
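+
+/* Note (explanatory aside): these macros mirror the `extern __inline'
+   definitions above.  The usual rationale is that the rounding-mode
+   argument must reach the builtin as an integer constant expression;
+   without __OPTIMIZE__ the wrappers need not be inlined, so plain macro
+   substitution keeps the constant literal. */
+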
+ (__m512d) __builtin_ia32_vfnmaddpd512_mask(A, B, C, -1, R) + +#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \ + (__m512d) __builtin_ia32_vfnmaddpd512_mask(A, B, C, U, R) + +#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \ + (__m512d) __builtin_ia32_vfnmaddpd512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \ + (__m512d) __builtin_ia32_vfnmaddpd512_maskz(A, B, C, U, R) + +#define _mm512_fnmadd_round_ps(A, B, C, R) \ + (__m512) __builtin_ia32_vfnmaddps512_mask(A, B, C, -1, R) + +#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \ + (__m512) __builtin_ia32_vfnmaddps512_mask(A, B, C, U, R) + +#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \ + (__m512) __builtin_ia32_vfnmaddps512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \ + (__m512) __builtin_ia32_vfnmaddps512_maskz(A, B, C, U, R) + +#define _mm512_fnmsub_round_pd(A, B, C, R) \ + (__m512d) __builtin_ia32_vfnmsubpd512_mask(A, B, C, -1, R) + +#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \ + (__m512d) __builtin_ia32_vfnmsubpd512_mask(A, B, C, U, R) + +#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \ + (__m512d) __builtin_ia32_vfnmsubpd512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \ + (__m512d) __builtin_ia32_vfnmsubpd512_maskz(A, B, C, U, R) + +#define _mm512_fnmsub_round_ps(A, B, C, R) \ + (__m512) __builtin_ia32_vfnmsubps512_mask(A, B, C, -1, R) + +#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \ + (__m512) __builtin_ia32_vfnmsubps512_mask(A, B, C, U, R) + +#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \ + (__m512) __builtin_ia32_vfnmsubps512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \ + (__m512) __builtin_ia32_vfnmsubps512_maskz(A, B, C, U, R) +#endif + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_abs_epi64(__m512i __A) { + return (__m512i)__builtin_ia32_pabsq512_mask( + (__v8di)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_abs_epi64(__m512i __W, __mmask8 __U, __m512i __A) { + return (__m512i)__builtin_ia32_pabsq512_mask((__v8di)__A, (__v8di)__W, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_abs_epi64(__mmask8 __U, __m512i __A) { + return (__m512i)__builtin_ia32_pabsq512_mask( + (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_abs_epi32(__m512i __A) { + return (__m512i)__builtin_ia32_pabsd512_mask( + (__v16si)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_abs_epi32(__m512i __W, __mmask16 __U, __m512i __A) { + return (__m512i)__builtin_ia32_pabsd512_mask((__v16si)__A, (__v16si)__W, + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_abs_epi32(__mmask16 __U, __m512i __A) { + return (__m512i)__builtin_ia32_pabsd512_mask( + (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcastss_ps(__m128 __A) { + return (__m512)__builtin_ia32_broadcastss512( + (__v4sf)__A, 
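+
+/* Note (explanatory aside): the unmasked forms pass
+   _mm512_undefined_epi32() (or the _ps/_pd equivalent) together with an
+   all-ones mask, telling the compiler the pass-through operand is a
+   don't-care so no false dependency on a previous value is created. */
+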
(__v16sf)_mm512_undefined_ps(), (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, __m128 __A) { + return (__m512)__builtin_ia32_broadcastss512((__v4sf)__A, (__v16sf)__O, __M); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcastss_ps(__mmask16 __M, __m128 __A) { + return (__m512)__builtin_ia32_broadcastss512( + (__v4sf)__A, (__v16sf)_mm512_setzero_ps(), __M); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcastsd_pd(__m128d __A) { + return (__m512d)__builtin_ia32_broadcastsd512( + (__v2df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, __m128d __A) { + return (__m512d)__builtin_ia32_broadcastsd512((__v2df)__A, (__v8df)__O, __M); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) { + return (__m512d)__builtin_ia32_broadcastsd512( + (__v2df)__A, (__v8df)_mm512_setzero_pd(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcastd_epi32(__m128i __A) { + return (__m512i)__builtin_ia32_pbroadcastd512( + (__v4si)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_broadcastd_epi32(__m512i __O, __mmask16 __M, __m128i __A) { + return (__m512i)__builtin_ia32_pbroadcastd512((__v4si)__A, (__v16si)__O, __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcastd_epi32(__mmask16 __M, __m128i __A) { + return (__m512i)__builtin_ia32_pbroadcastd512( + (__v4si)__A, (__v16si)_mm512_setzero_si512(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_set1_epi32(int __A) { + return (__m512i)__builtin_ia32_pbroadcastd512_gpr_mask( + __A, (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1)); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_set1_epi32(__m512i __O, __mmask16 __M, int __A) { + return (__m512i)__builtin_ia32_pbroadcastd512_gpr_mask(__A, (__v16si)__O, + __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_set1_epi32(__mmask16 __M, int __A) { + return (__m512i)__builtin_ia32_pbroadcastd512_gpr_mask( + __A, (__v16si)_mm512_setzero_si512(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcastq_epi64(__m128i __A) { + return (__m512i)__builtin_ia32_pbroadcastq512( + (__v2di)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_broadcastq_epi64(__m512i __O, __mmask8 __M, __m128i __A) { + return (__m512i)__builtin_ia32_pbroadcastq512((__v2di)__A, (__v8di)__O, __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { + return (__m512i)__builtin_ia32_pbroadcastq512( + 
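+
+/* Note (explanatory aside; the variable name `ones' is illustrative only):
+   _mm512_set1_epi32 splats a scalar straight from a general register via
+   the *_gpr_mask builtin, e.g.
+
+     __m512i ones = _mm512_set1_epi32(1);  // sixteen copies of 1
+
+   whereas _mm512_broadcastd_epi32 broadcasts element 0 of an __m128i. */
+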
(__v2di)__A, (__v8di)_mm512_setzero_si512(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_set1_epi64(long long __A) { + return (__m512i)__builtin_ia32_pbroadcastq512_gpr_mask( + __A, (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1)); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_set1_epi64(__m512i __O, __mmask8 __M, long long __A) { + return (__m512i)__builtin_ia32_pbroadcastq512_gpr_mask(__A, (__v8di)__O, __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_set1_epi64(__mmask8 __M, long long __A) { + return (__m512i)__builtin_ia32_pbroadcastq512_gpr_mask( + __A, (__v8di)_mm512_setzero_si512(), __M); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcast_f32x4(__m128 __A) { + return (__m512)__builtin_ia32_broadcastf32x4_512( + (__v4sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) { + return (__m512)__builtin_ia32_broadcastf32x4_512((__v4sf)__A, (__v16sf)__O, + __M); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) { + return (__m512)__builtin_ia32_broadcastf32x4_512( + (__v4sf)__A, (__v16sf)_mm512_setzero_ps(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcast_i32x4(__m128i __A) { + return (__m512i)__builtin_ia32_broadcasti32x4_512( + (__v4si)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) { + return (__m512i)__builtin_ia32_broadcasti32x4_512((__v4si)__A, (__v16si)__O, + __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) { + return (__m512i)__builtin_ia32_broadcasti32x4_512( + (__v4si)__A, (__v16si)_mm512_setzero_si512(), __M); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcast_f64x4(__m256d __A) { + return (__m512d)__builtin_ia32_broadcastf64x4_512( + (__v4df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A) { + return (__m512d)__builtin_ia32_broadcastf64x4_512((__v4df)__A, (__v8df)__O, + __M); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) { + return (__m512d)__builtin_ia32_broadcastf64x4_512( + (__v4df)__A, (__v8df)_mm512_setzero_pd(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_broadcast_i64x4(__m256i __A) { + return (__m512i)__builtin_ia32_broadcasti64x4_512( + (__v4di)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i 
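+
+/* Note (explanatory aside): the _f32x4/_i32x4 broadcasts replicate a whole
+   128-bit block four times across the 512-bit result, and the _f64x4/_i64x4
+   broadcasts replicate a 256-bit block twice; the mask then applies per
+   element of the replicated result. */
+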
__A) { + return (__m512i)__builtin_ia32_broadcasti64x4_512((__v4di)__A, (__v8di)__O, + __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) { + return (__m512i)__builtin_ia32_broadcasti64x4_512( + (__v4di)__A, (__v8di)_mm512_setzero_si512(), __M); +} + +typedef enum { + _MM_PERM_AAAA = 0x00, + _MM_PERM_AAAB = 0x01, + _MM_PERM_AAAC = 0x02, + _MM_PERM_AAAD = 0x03, + _MM_PERM_AABA = 0x04, + _MM_PERM_AABB = 0x05, + _MM_PERM_AABC = 0x06, + _MM_PERM_AABD = 0x07, + _MM_PERM_AACA = 0x08, + _MM_PERM_AACB = 0x09, + _MM_PERM_AACC = 0x0A, + _MM_PERM_AACD = 0x0B, + _MM_PERM_AADA = 0x0C, + _MM_PERM_AADB = 0x0D, + _MM_PERM_AADC = 0x0E, + _MM_PERM_AADD = 0x0F, + _MM_PERM_ABAA = 0x10, + _MM_PERM_ABAB = 0x11, + _MM_PERM_ABAC = 0x12, + _MM_PERM_ABAD = 0x13, + _MM_PERM_ABBA = 0x14, + _MM_PERM_ABBB = 0x15, + _MM_PERM_ABBC = 0x16, + _MM_PERM_ABBD = 0x17, + _MM_PERM_ABCA = 0x18, + _MM_PERM_ABCB = 0x19, + _MM_PERM_ABCC = 0x1A, + _MM_PERM_ABCD = 0x1B, + _MM_PERM_ABDA = 0x1C, + _MM_PERM_ABDB = 0x1D, + _MM_PERM_ABDC = 0x1E, + _MM_PERM_ABDD = 0x1F, + _MM_PERM_ACAA = 0x20, + _MM_PERM_ACAB = 0x21, + _MM_PERM_ACAC = 0x22, + _MM_PERM_ACAD = 0x23, + _MM_PERM_ACBA = 0x24, + _MM_PERM_ACBB = 0x25, + _MM_PERM_ACBC = 0x26, + _MM_PERM_ACBD = 0x27, + _MM_PERM_ACCA = 0x28, + _MM_PERM_ACCB = 0x29, + _MM_PERM_ACCC = 0x2A, + _MM_PERM_ACCD = 0x2B, + _MM_PERM_ACDA = 0x2C, + _MM_PERM_ACDB = 0x2D, + _MM_PERM_ACDC = 0x2E, + _MM_PERM_ACDD = 0x2F, + _MM_PERM_ADAA = 0x30, + _MM_PERM_ADAB = 0x31, + _MM_PERM_ADAC = 0x32, + _MM_PERM_ADAD = 0x33, + _MM_PERM_ADBA = 0x34, + _MM_PERM_ADBB = 0x35, + _MM_PERM_ADBC = 0x36, + _MM_PERM_ADBD = 0x37, + _MM_PERM_ADCA = 0x38, + _MM_PERM_ADCB = 0x39, + _MM_PERM_ADCC = 0x3A, + _MM_PERM_ADCD = 0x3B, + _MM_PERM_ADDA = 0x3C, + _MM_PERM_ADDB = 0x3D, + _MM_PERM_ADDC = 0x3E, + _MM_PERM_ADDD = 0x3F, + _MM_PERM_BAAA = 0x40, + _MM_PERM_BAAB = 0x41, + _MM_PERM_BAAC = 0x42, + _MM_PERM_BAAD = 0x43, + _MM_PERM_BABA = 0x44, + _MM_PERM_BABB = 0x45, + _MM_PERM_BABC = 0x46, + _MM_PERM_BABD = 0x47, + _MM_PERM_BACA = 0x48, + _MM_PERM_BACB = 0x49, + _MM_PERM_BACC = 0x4A, + _MM_PERM_BACD = 0x4B, + _MM_PERM_BADA = 0x4C, + _MM_PERM_BADB = 0x4D, + _MM_PERM_BADC = 0x4E, + _MM_PERM_BADD = 0x4F, + _MM_PERM_BBAA = 0x50, + _MM_PERM_BBAB = 0x51, + _MM_PERM_BBAC = 0x52, + _MM_PERM_BBAD = 0x53, + _MM_PERM_BBBA = 0x54, + _MM_PERM_BBBB = 0x55, + _MM_PERM_BBBC = 0x56, + _MM_PERM_BBBD = 0x57, + _MM_PERM_BBCA = 0x58, + _MM_PERM_BBCB = 0x59, + _MM_PERM_BBCC = 0x5A, + _MM_PERM_BBCD = 0x5B, + _MM_PERM_BBDA = 0x5C, + _MM_PERM_BBDB = 0x5D, + _MM_PERM_BBDC = 0x5E, + _MM_PERM_BBDD = 0x5F, + _MM_PERM_BCAA = 0x60, + _MM_PERM_BCAB = 0x61, + _MM_PERM_BCAC = 0x62, + _MM_PERM_BCAD = 0x63, + _MM_PERM_BCBA = 0x64, + _MM_PERM_BCBB = 0x65, + _MM_PERM_BCBC = 0x66, + _MM_PERM_BCBD = 0x67, + _MM_PERM_BCCA = 0x68, + _MM_PERM_BCCB = 0x69, + _MM_PERM_BCCC = 0x6A, + _MM_PERM_BCCD = 0x6B, + _MM_PERM_BCDA = 0x6C, + _MM_PERM_BCDB = 0x6D, + _MM_PERM_BCDC = 0x6E, + _MM_PERM_BCDD = 0x6F, + _MM_PERM_BDAA = 0x70, + _MM_PERM_BDAB = 0x71, + _MM_PERM_BDAC = 0x72, + _MM_PERM_BDAD = 0x73, + _MM_PERM_BDBA = 0x74, + _MM_PERM_BDBB = 0x75, + _MM_PERM_BDBC = 0x76, + _MM_PERM_BDBD = 0x77, + _MM_PERM_BDCA = 0x78, + _MM_PERM_BDCB = 0x79, + _MM_PERM_BDCC = 0x7A, + _MM_PERM_BDCD = 0x7B, + _MM_PERM_BDDA = 0x7C, + _MM_PERM_BDDB = 0x7D, + _MM_PERM_BDDC = 0x7E, + _MM_PERM_BDDD = 0x7F, + _MM_PERM_CAAA = 0x80, + _MM_PERM_CAAB = 0x81, + _MM_PERM_CAAC = 0x82, + _MM_PERM_CAAD = 0x83, + 
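+
+  /* Note (explanatory aside): each name lists, from the most significant
+     two-bit field of the immediate down to the least, which source element
+     (A=0 .. D=3) is selected, so _MM_PERM_DCBA (0xE4) is the identity
+     permutation. */
+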
_MM_PERM_CABA = 0x84, + _MM_PERM_CABB = 0x85, + _MM_PERM_CABC = 0x86, + _MM_PERM_CABD = 0x87, + _MM_PERM_CACA = 0x88, + _MM_PERM_CACB = 0x89, + _MM_PERM_CACC = 0x8A, + _MM_PERM_CACD = 0x8B, + _MM_PERM_CADA = 0x8C, + _MM_PERM_CADB = 0x8D, + _MM_PERM_CADC = 0x8E, + _MM_PERM_CADD = 0x8F, + _MM_PERM_CBAA = 0x90, + _MM_PERM_CBAB = 0x91, + _MM_PERM_CBAC = 0x92, + _MM_PERM_CBAD = 0x93, + _MM_PERM_CBBA = 0x94, + _MM_PERM_CBBB = 0x95, + _MM_PERM_CBBC = 0x96, + _MM_PERM_CBBD = 0x97, + _MM_PERM_CBCA = 0x98, + _MM_PERM_CBCB = 0x99, + _MM_PERM_CBCC = 0x9A, + _MM_PERM_CBCD = 0x9B, + _MM_PERM_CBDA = 0x9C, + _MM_PERM_CBDB = 0x9D, + _MM_PERM_CBDC = 0x9E, + _MM_PERM_CBDD = 0x9F, + _MM_PERM_CCAA = 0xA0, + _MM_PERM_CCAB = 0xA1, + _MM_PERM_CCAC = 0xA2, + _MM_PERM_CCAD = 0xA3, + _MM_PERM_CCBA = 0xA4, + _MM_PERM_CCBB = 0xA5, + _MM_PERM_CCBC = 0xA6, + _MM_PERM_CCBD = 0xA7, + _MM_PERM_CCCA = 0xA8, + _MM_PERM_CCCB = 0xA9, + _MM_PERM_CCCC = 0xAA, + _MM_PERM_CCCD = 0xAB, + _MM_PERM_CCDA = 0xAC, + _MM_PERM_CCDB = 0xAD, + _MM_PERM_CCDC = 0xAE, + _MM_PERM_CCDD = 0xAF, + _MM_PERM_CDAA = 0xB0, + _MM_PERM_CDAB = 0xB1, + _MM_PERM_CDAC = 0xB2, + _MM_PERM_CDAD = 0xB3, + _MM_PERM_CDBA = 0xB4, + _MM_PERM_CDBB = 0xB5, + _MM_PERM_CDBC = 0xB6, + _MM_PERM_CDBD = 0xB7, + _MM_PERM_CDCA = 0xB8, + _MM_PERM_CDCB = 0xB9, + _MM_PERM_CDCC = 0xBA, + _MM_PERM_CDCD = 0xBB, + _MM_PERM_CDDA = 0xBC, + _MM_PERM_CDDB = 0xBD, + _MM_PERM_CDDC = 0xBE, + _MM_PERM_CDDD = 0xBF, + _MM_PERM_DAAA = 0xC0, + _MM_PERM_DAAB = 0xC1, + _MM_PERM_DAAC = 0xC2, + _MM_PERM_DAAD = 0xC3, + _MM_PERM_DABA = 0xC4, + _MM_PERM_DABB = 0xC5, + _MM_PERM_DABC = 0xC6, + _MM_PERM_DABD = 0xC7, + _MM_PERM_DACA = 0xC8, + _MM_PERM_DACB = 0xC9, + _MM_PERM_DACC = 0xCA, + _MM_PERM_DACD = 0xCB, + _MM_PERM_DADA = 0xCC, + _MM_PERM_DADB = 0xCD, + _MM_PERM_DADC = 0xCE, + _MM_PERM_DADD = 0xCF, + _MM_PERM_DBAA = 0xD0, + _MM_PERM_DBAB = 0xD1, + _MM_PERM_DBAC = 0xD2, + _MM_PERM_DBAD = 0xD3, + _MM_PERM_DBBA = 0xD4, + _MM_PERM_DBBB = 0xD5, + _MM_PERM_DBBC = 0xD6, + _MM_PERM_DBBD = 0xD7, + _MM_PERM_DBCA = 0xD8, + _MM_PERM_DBCB = 0xD9, + _MM_PERM_DBCC = 0xDA, + _MM_PERM_DBCD = 0xDB, + _MM_PERM_DBDA = 0xDC, + _MM_PERM_DBDB = 0xDD, + _MM_PERM_DBDC = 0xDE, + _MM_PERM_DBDD = 0xDF, + _MM_PERM_DCAA = 0xE0, + _MM_PERM_DCAB = 0xE1, + _MM_PERM_DCAC = 0xE2, + _MM_PERM_DCAD = 0xE3, + _MM_PERM_DCBA = 0xE4, + _MM_PERM_DCBB = 0xE5, + _MM_PERM_DCBC = 0xE6, + _MM_PERM_DCBD = 0xE7, + _MM_PERM_DCCA = 0xE8, + _MM_PERM_DCCB = 0xE9, + _MM_PERM_DCCC = 0xEA, + _MM_PERM_DCCD = 0xEB, + _MM_PERM_DCDA = 0xEC, + _MM_PERM_DCDB = 0xED, + _MM_PERM_DCDC = 0xEE, + _MM_PERM_DCDD = 0xEF, + _MM_PERM_DDAA = 0xF0, + _MM_PERM_DDAB = 0xF1, + _MM_PERM_DDAC = 0xF2, + _MM_PERM_DDAD = 0xF3, + _MM_PERM_DDBA = 0xF4, + _MM_PERM_DDBB = 0xF5, + _MM_PERM_DDBC = 0xF6, + _MM_PERM_DDBD = 0xF7, + _MM_PERM_DDCA = 0xF8, + _MM_PERM_DDCB = 0xF9, + _MM_PERM_DDCC = 0xFA, + _MM_PERM_DDCD = 0xFB, + _MM_PERM_DDDA = 0xFC, + _MM_PERM_DDDB = 0xFD, + _MM_PERM_DDDC = 0xFE, + _MM_PERM_DDDD = 0xFF +} _MM_PERM_ENUM; + +#ifdef __OPTIMIZE__ +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shuffle_epi32(__m512i __A, _MM_PERM_ENUM __mask) { + return (__m512i)__builtin_ia32_pshufd512_mask( + (__v16si)__A, __mask, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_shuffle_epi32(__m512i __W, __mmask16 __U, __m512i __A, + _MM_PERM_ENUM __mask) { + return (__m512i)__builtin_ia32_pshufd512_mask((__v16si)__A, 
__mask, + (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_maskz_shuffle_epi32(__mmask16 __U, __m512i __A, _MM_PERM_ENUM __mask) { + return (__m512i)__builtin_ia32_pshufd512_mask( + (__v16si)__A, __mask, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shuffle_i64x2(__m512i __A, __m512i __B, const int __imm) { + return (__m512i)__builtin_ia32_shuf_i64x2_mask( + (__v8di)__A, (__v8di)__B, __imm, (__v8di)_mm512_undefined_epi32(), + (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_shuffle_i64x2(__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B, const int __imm) { + return (__m512i)__builtin_ia32_shuf_i64x2_mask( + (__v8di)__A, (__v8di)__B, __imm, (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_shuffle_i64x2(__mmask8 __U, __m512i __A, __m512i __B, + const int __imm) { + return (__m512i)__builtin_ia32_shuf_i64x2_mask( + (__v8di)__A, (__v8di)__B, __imm, (__v8di)_mm512_setzero_si512(), + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shuffle_i32x4(__m512i __A, __m512i __B, const int __imm) { + return (__m512i)__builtin_ia32_shuf_i32x4_mask( + (__v16si)__A, (__v16si)__B, __imm, (__v16si)_mm512_undefined_epi32(), + (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B, const int __imm) { + return (__m512i)__builtin_ia32_shuf_i32x4_mask( + (__v16si)__A, (__v16si)__B, __imm, (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, __m512i __B, + const int __imm) { + return (__m512i)__builtin_ia32_shuf_i32x4_mask( + (__v16si)__A, (__v16si)__B, __imm, (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shuffle_f64x2(__m512d __A, __m512d __B, const int __imm) { + return (__m512d)__builtin_ia32_shuf_f64x2_mask( + (__v8df)__A, (__v8df)__B, __imm, (__v8df)_mm512_undefined_pd(), + (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_shuffle_f64x2(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __imm) { + return (__m512d)__builtin_ia32_shuf_f64x2_mask( + (__v8df)__A, (__v8df)__B, __imm, (__v8df)__W, (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_shuffle_f64x2(__mmask8 __U, __m512d __A, __m512d __B, + const int __imm) { + return (__m512d)__builtin_ia32_shuf_f64x2_mask( + (__v8df)__A, (__v8df)__B, __imm, (__v8df)_mm512_setzero_pd(), + (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shuffle_f32x4(__m512 __A, __m512 __B, const int __imm) { + return (__m512)__builtin_ia32_shuf_f32x4_mask( + (__v16sf)__A, (__v16sf)__B, __imm, (__v16sf)_mm512_undefined_ps(), + (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, 
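+
+/* Note (explanatory aside): the shuffle_i64x2/shuffle_i32x4 family moves
+   whole 128-bit lanes -- the two low destination lanes select from __A and
+   the two high lanes from __B, two immediate bits per lane -- while plain
+   shuffle_epi32 applies the same four-element permutation within each
+   128-bit lane. */
+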
__always_inline__, __artificial__)) + _mm512_mask_shuffle_f32x4(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, + const int __imm) { + return (__m512)__builtin_ia32_shuf_f32x4_mask( + (__v16sf)__A, (__v16sf)__B, __imm, (__v16sf)__W, (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_shuffle_f32x4(__mmask16 __U, __m512 __A, __m512 __B, + const int __imm) { + return (__m512)__builtin_ia32_shuf_f32x4_mask( + (__v16sf)__A, (__v16sf)__B, __imm, (__v16sf)_mm512_setzero_ps(), + (__mmask16)__U); +} + +#else +#define _mm512_shuffle_epi32(X, C) \ + ((__m512i)__builtin_ia32_pshufd512_mask( \ + (__v16si)(__m512i)(X), (int)(C), \ + (__v16si)(__m512i)_mm512_undefined_epi32(), (__mmask16)-1)) + +#define _mm512_mask_shuffle_epi32(W, U, X, C) \ + ((__m512i)__builtin_ia32_pshufd512_mask( \ + (__v16si)(__m512i)(X), (int)(C), (__v16si)(__m512i)(W), (__mmask16)(U))) + +#define _mm512_maskz_shuffle_epi32(U, X, C) \ + ((__m512i)__builtin_ia32_pshufd512_mask( \ + (__v16si)(__m512i)(X), (int)(C), \ + (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)(U))) + +#define _mm512_shuffle_i64x2(X, Y, C) \ + ((__m512i)__builtin_ia32_shuf_i64x2_mask( \ + (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), \ + (__v8di)(__m512i)_mm512_undefined_epi32(), (__mmask8)-1)) + +#define _mm512_mask_shuffle_i64x2(W, U, X, Y, C) \ + ((__m512i)__builtin_ia32_shuf_i64x2_mask( \ + (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), \ + (__v8di)(__m512i)(W), (__mmask8)(U))) + +#define _mm512_maskz_shuffle_i64x2(U, X, Y, C) \ + ((__m512i)__builtin_ia32_shuf_i64x2_mask( \ + (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), \ + (__v8di)(__m512i)_mm512_setzero_si512(), (__mmask8)(U))) + +#define _mm512_shuffle_i32x4(X, Y, C) \ + ((__m512i)__builtin_ia32_shuf_i32x4_mask( \ + (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), \ + (__v16si)(__m512i)_mm512_undefined_epi32(), (__mmask16)-1)) + +#define _mm512_mask_shuffle_i32x4(W, U, X, Y, C) \ + ((__m512i)__builtin_ia32_shuf_i32x4_mask( \ + (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), \ + (__v16si)(__m512i)(W), (__mmask16)(U))) + +#define _mm512_maskz_shuffle_i32x4(U, X, Y, C) \ + ((__m512i)__builtin_ia32_shuf_i32x4_mask( \ + (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), \ + (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)(U))) + +#define _mm512_shuffle_f64x2(X, Y, C) \ + ((__m512d)__builtin_ia32_shuf_f64x2_mask( \ + (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), \ + (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1)) + +#define _mm512_mask_shuffle_f64x2(W, U, X, Y, C) \ + ((__m512d)__builtin_ia32_shuf_f64x2_mask( \ + (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), \ + (__v8df)(__m512d)(W), (__mmask8)(U))) + +#define _mm512_maskz_shuffle_f64x2(U, X, Y, C) \ + ((__m512d)__builtin_ia32_shuf_f64x2_mask( \ + (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), \ + (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U))) + +#define _mm512_shuffle_f32x4(X, Y, C) \ + ((__m512)__builtin_ia32_shuf_f32x4_mask( \ + (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), \ + (__v16sf)(__m512)_mm512_undefined_ps(), (__mmask16)-1)) + +#define _mm512_mask_shuffle_f32x4(W, U, X, Y, C) \ + ((__m512)__builtin_ia32_shuf_f32x4_mask( \ + (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), \ + (__v16sf)(__m512)(W), (__mmask16)(U))) + +#define _mm512_maskz_shuffle_f32x4(U, X, Y, C) \ + ((__m512)__builtin_ia32_shuf_f32x4_mask( \ + (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), \ 
+ (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U))) +#endif + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_rolv_epi32(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_prolvd512_mask( + (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), + (__mmask16)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_rolv_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_prolvd512_mask((__v16si)__A, (__v16si)__B, + (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_rolv_epi32(__mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_prolvd512_mask((__v16si)__A, (__v16si)__B, + (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_rorv_epi32(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_prorvd512_mask( + (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), + (__mmask16)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_rorv_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_prorvd512_mask((__v16si)__A, (__v16si)__B, + (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_rorv_epi32(__mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_prorvd512_mask((__v16si)__A, (__v16si)__B, + (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_rolv_epi64(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_prolvq512_mask( + (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_rolv_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_prolvq512_mask((__v8di)__A, (__v8di)__B, + (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_rolv_epi64(__mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_prolvq512_mask( + (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_rorv_epi64(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_prorvq512_mask( + (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_rorv_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_prorvq512_mask((__v8di)__A, (__v8di)__B, + (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_rorv_epi64(__mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_prorvq512_mask( + (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i + 
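+
+/* Note (explanatory aside; `v' is a placeholder name): rolv/rorv rotate
+   each element left or right by a per-element count taken from __B, with
+   counts used modulo the element width.  For instance,
+
+     __m512i r = _mm512_rolv_epi32(v, _mm512_set1_epi32(7));
+
+   rotates every 32-bit element of v left by seven bits. */
+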
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtt_roundpd_epi32(__m512d __A, const int __R) { + return (__m256i)__builtin_ia32_cvttpd2dq512_mask( + (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, __R); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtt_roundpd_epi32(__m256i __W, __mmask8 __U, __m512d __A, + const int __R) { + return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)__A, (__v8si)__W, + (__mmask8)__U, __R); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtt_roundpd_epi32(__mmask8 __U, __m512d __A, const int __R) { + return (__m256i)__builtin_ia32_cvttpd2dq512_mask( + (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, __R); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtt_roundpd_epu32(__m512d __A, const int __R) { + return (__m256i)__builtin_ia32_cvttpd2udq512_mask( + (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, __R); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtt_roundpd_epu32(__m256i __W, __mmask8 __U, __m512d __A, + const int __R) { + return (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)__A, (__v8si)__W, + (__mmask8)__U, __R); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtt_roundpd_epu32(__mmask8 __U, __m512d __A, const int __R) { + return (__m256i)__builtin_ia32_cvttpd2udq512_mask( + (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, __R); +} +#else +#define _mm512_cvtt_roundpd_epi32(A, B) \ + ((__m256i)__builtin_ia32_cvttpd2dq512_mask( \ + A, (__v8si)_mm256_undefined_si256(), -1, B)) + +#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, B) \ + ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)(W), U, B)) + +#define _mm512_maskz_cvtt_roundpd_epi32(U, A, B) \ + ((__m256i)__builtin_ia32_cvttpd2dq512_mask( \ + A, (__v8si)_mm256_setzero_si256(), U, B)) + +#define _mm512_cvtt_roundpd_epu32(A, B) \ + ((__m256i)__builtin_ia32_cvttpd2udq512_mask( \ + A, (__v8si)_mm256_undefined_si256(), -1, B)) + +#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, B) \ + ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)(W), U, B)) + +#define _mm512_maskz_cvtt_roundpd_epu32(U, A, B) \ + ((__m256i)__builtin_ia32_cvttpd2udq512_mask( \ + A, (__v8si)_mm256_setzero_si256(), U, B)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvt_roundpd_epi32(__m512d __A, const int __R) { + return (__m256i)__builtin_ia32_cvtpd2dq512_mask( + (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, __R); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvt_roundpd_epi32(__m256i __W, __mmask8 __U, __m512d __A, + const int __R) { + return (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)__A, (__v8si)__W, + (__mmask8)__U, __R); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvt_roundpd_epi32(__mmask8 __U, __m512d __A, const int __R) { + return (__m256i)__builtin_ia32_cvtpd2dq512_mask( + (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, __R); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvt_roundpd_epu32(__m512d 
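+
+/* Note (explanatory aside): the `cvtt' (truncating) conversions always
+   round toward zero, so their __R argument only suppresses exceptions
+   (e.g. _MM_FROUND_NO_EXC); the plain `cvt' forms honour the rounding mode
+   encoded in __R instead. */
+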
__A, const int __R) { + return (__m256i)__builtin_ia32_cvtpd2udq512_mask( + (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, __R); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvt_roundpd_epu32(__m256i __W, __mmask8 __U, __m512d __A, + const int __R) { + return (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)__A, (__v8si)__W, + (__mmask8)__U, __R); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvt_roundpd_epu32(__mmask8 __U, __m512d __A, const int __R) { + return (__m256i)__builtin_ia32_cvtpd2udq512_mask( + (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, __R); +} +#else +#define _mm512_cvt_roundpd_epi32(A, B) \ + ((__m256i)__builtin_ia32_cvtpd2dq512_mask( \ + A, (__v8si)_mm256_undefined_si256(), -1, B)) + +#define _mm512_mask_cvt_roundpd_epi32(W, U, A, B) \ + ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)(W), U, B)) + +#define _mm512_maskz_cvt_roundpd_epi32(U, A, B) \ + ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), \ + U, B)) + +#define _mm512_cvt_roundpd_epu32(A, B) \ + ((__m256i)__builtin_ia32_cvtpd2udq512_mask( \ + A, (__v8si)_mm256_undefined_si256(), -1, B)) + +#define _mm512_mask_cvt_roundpd_epu32(W, U, A, B) \ + ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)(W), U, B)) + +#define _mm512_maskz_cvt_roundpd_epu32(U, A, B) \ + ((__m256i)__builtin_ia32_cvtpd2udq512_mask( \ + A, (__v8si)_mm256_setzero_si256(), U, B)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtt_roundps_epi32(__m512 __A, const int __R) { + return (__m512i)__builtin_ia32_cvttps2dq512_mask( + (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtt_roundps_epi32(__m512i __W, __mmask16 __U, __m512 __A, + const int __R) { + return (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)__A, (__v16si)__W, + (__mmask16)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtt_roundps_epi32(__mmask16 __U, __m512 __A, const int __R) { + return (__m512i)__builtin_ia32_cvttps2dq512_mask( + (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtt_roundps_epu32(__m512 __A, const int __R) { + return (__m512i)__builtin_ia32_cvttps2udq512_mask( + (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtt_roundps_epu32(__m512i __W, __mmask16 __U, __m512 __A, + const int __R) { + return (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)__A, (__v16si)__W, + (__mmask16)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtt_roundps_epu32(__mmask16 __U, __m512 __A, const int __R) { + return (__m512i)__builtin_ia32_cvttps2udq512_mask( + (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, __R); +} +#else +#define _mm512_cvtt_roundps_epi32(A, B) \ + ((__m512i)__builtin_ia32_cvttps2dq512_mask( \ + A, (__v16si)_mm512_undefined_epi32(), -1, B)) + +#define _mm512_mask_cvtt_roundps_epi32(W, U, A, B) \ + 
((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)(W), U, B)) + +#define _mm512_maskz_cvtt_roundps_epi32(U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2dq512_mask( \ + A, (__v16si)_mm512_setzero_si512(), U, B)) + +#define _mm512_cvtt_roundps_epu32(A, B) \ + ((__m512i)__builtin_ia32_cvttps2udq512_mask( \ + A, (__v16si)_mm512_undefined_epi32(), -1, B)) + +#define _mm512_mask_cvtt_roundps_epu32(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)(W), U, B)) + +#define _mm512_maskz_cvtt_roundps_epu32(U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2udq512_mask( \ + A, (__v16si)_mm512_setzero_si512(), U, B)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvt_roundps_epi32(__m512 __A, const int __R) { + return (__m512i)__builtin_ia32_cvtps2dq512_mask( + (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvt_roundps_epi32(__m512i __W, __mmask16 __U, __m512 __A, + const int __R) { + return (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)__A, (__v16si)__W, + (__mmask16)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvt_roundps_epi32(__mmask16 __U, __m512 __A, const int __R) { + return (__m512i)__builtin_ia32_cvtps2dq512_mask( + (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvt_roundps_epu32(__m512 __A, const int __R) { + return (__m512i)__builtin_ia32_cvtps2udq512_mask( + (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvt_roundps_epu32(__m512i __W, __mmask16 __U, __m512 __A, + const int __R) { + return (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)__A, (__v16si)__W, + (__mmask16)__U, __R); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvt_roundps_epu32(__mmask16 __U, __m512 __A, const int __R) { + return (__m512i)__builtin_ia32_cvtps2udq512_mask( + (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, __R); +} +#else +#define _mm512_cvt_roundps_epi32(A, B) \ + ((__m512i)__builtin_ia32_cvtps2dq512_mask( \ + A, (__v16si)_mm512_undefined_epi32(), -1, B)) + +#define _mm512_mask_cvt_roundps_epi32(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)(W), U, B)) + +#define _mm512_maskz_cvt_roundps_epi32(U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2dq512_mask( \ + A, (__v16si)_mm512_setzero_si512(), U, B)) + +#define _mm512_cvt_roundps_epu32(A, B) \ + ((__m512i)__builtin_ia32_cvtps2udq512_mask( \ + A, (__v16si)_mm512_undefined_epi32(), -1, B)) + +#define _mm512_mask_cvt_roundps_epu32(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)(W), U, B)) + +#define _mm512_maskz_cvt_roundps_epu32(U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2udq512_mask( \ + A, (__v16si)_mm512_setzero_si512(), U, B)) +#endif + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtu32_sd(__m128d __A, unsigned __B) { + return (__m128d)__builtin_ia32_cvtusi2sd32((__v2df)__A, __B); +} + +#ifdef __x86_64__ +#ifdef __OPTIMIZE__ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, 
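+
+/* Note (explanatory aside): the _mm_cvt* scalar forms convert into element
+   0 only, copying the upper elements through from __A; the 64-bit integer
+   variants are fenced behind __x86_64__ because they require a 64-bit
+   general-purpose register. */
+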
__artificial__)) + _mm_cvt_roundu64_sd(__m128d __A, unsigned long long __B, const int __R) { + return (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)__A, __B, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_roundi64_sd(__m128d __A, long long __B, const int __R) { + return (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)__A, __B, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_roundsi64_sd(__m128d __A, long long __B, const int __R) { + return (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)__A, __B, __R); +} +#else +#define _mm_cvt_roundu64_sd(A, B, C) \ + (__m128d) __builtin_ia32_cvtusi2sd64(A, B, C) + +#define _mm_cvt_roundi64_sd(A, B, C) \ + (__m128d) __builtin_ia32_cvtsi2sd64(A, B, C) + +#define _mm_cvt_roundsi64_sd(A, B, C) \ + (__m128d) __builtin_ia32_cvtsi2sd64(A, B, C) +#endif + +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_roundu32_ss(__m128 __A, unsigned __B, const int __R) { + return (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)__A, __B, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_roundsi32_ss(__m128 __A, int __B, const int __R) { + return (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)__A, __B, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_roundi32_ss(__m128 __A, int __B, const int __R) { + return (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)__A, __B, __R); +} +#else +#define _mm_cvt_roundu32_ss(A, B, C) \ + (__m128) __builtin_ia32_cvtusi2ss32(A, B, C) + +#define _mm_cvt_roundi32_ss(A, B, C) (__m128) __builtin_ia32_cvtsi2ss32(A, B, C) + +#define _mm_cvt_roundsi32_ss(A, B, C) \ + (__m128) __builtin_ia32_cvtsi2ss32(A, B, C) +#endif + +#ifdef __x86_64__ +#ifdef __OPTIMIZE__ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_roundu64_ss(__m128 __A, unsigned long long __B, const int __R) { + return (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)__A, __B, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_roundsi64_ss(__m128 __A, long long __B, const int __R) { + return (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)__A, __B, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_roundi64_ss(__m128 __A, long long __B, const int __R) { + return (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)__A, __B, __R); +} +#else +#define _mm_cvt_roundu64_ss(A, B, C) \ + (__m128) __builtin_ia32_cvtusi2ss64(A, B, C) + +#define _mm_cvt_roundi64_ss(A, B, C) (__m128) __builtin_ia32_cvtsi2ss64(A, B, C) + +#define _mm_cvt_roundsi64_ss(A, B, C) \ + (__m128) __builtin_ia32_cvtsi2ss64(A, B, C) +#endif + +#endif + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi32_epi8(__m512i __A) { + return (__m128i)__builtin_ia32_pmovdb512_mask( + (__v16si)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi32_storeu_epi8(void *__P, __mmask16 __M, __m512i __A) { + __builtin_ia32_pmovdb512mem_mask((__v16qi *)__P, (__v16si)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm512_mask_cvtepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovdb512_mask((__v16si)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi32_epi8(__mmask16 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovdb512_mask( + (__v16si)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtsepi32_epi8(__m512i __A) { + return (__m128i)__builtin_ia32_pmovsdb512_mask( + (__v16si)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtsepi32_storeu_epi8(void *__P, __mmask16 __M, __m512i __A) { + __builtin_ia32_pmovsdb512mem_mask((__v16qi *)__P, (__v16si)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtsepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovsdb512_mask((__v16si)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtsepi32_epi8(__mmask16 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovsdb512_mask( + (__v16si)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtusepi32_epi8(__m512i __A) { + return (__m128i)__builtin_ia32_pmovusdb512_mask( + (__v16si)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtusepi32_storeu_epi8(void *__P, __mmask16 __M, __m512i __A) { + __builtin_ia32_pmovusdb512mem_mask((__v16qi *)__P, (__v16si)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtusepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovusdb512_mask((__v16si)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtusepi32_epi8(__mmask16 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovusdb512_mask( + (__v16si)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi32_epi16(__m512i __A) { + return (__m256i)__builtin_ia32_pmovdw512_mask( + (__v16si)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A) { + __builtin_ia32_pmovdw512mem_mask((__v16hi *)__P, (__v16si)__A, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovdw512_mask((__v16si)__A, (__v16hi)__O, + __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi32_epi16(__mmask16 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovdw512_mask( + (__v16si)__A, (__v16hi)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, 
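+
+/* Note (explanatory aside): the integer down-conversions come in three
+   flavours -- plain `cvt' truncates the bits, `cvts' saturates as signed,
+   and `cvtus' saturates as unsigned -- and each has a `_storeu' form that
+   writes the narrowed, masked result straight to unaligned memory. */
+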
__artificial__)) + _mm512_cvtsepi32_epi16(__m512i __A) { + return (__m256i)__builtin_ia32_pmovsdw512_mask( + (__v16si)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtsepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A) { + __builtin_ia32_pmovsdw512mem_mask((__v16hi *)__P, (__v16si)__A, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtsepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovsdw512_mask((__v16si)__A, (__v16hi)__O, + __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtsepi32_epi16(__mmask16 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovsdw512_mask( + (__v16si)__A, (__v16hi)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtusepi32_epi16(__m512i __A) { + return (__m256i)__builtin_ia32_pmovusdw512_mask( + (__v16si)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtusepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A) { + __builtin_ia32_pmovusdw512mem_mask((__v16hi *)__P, (__v16si)__A, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtusepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovusdw512_mask((__v16si)__A, (__v16hi)__O, + __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtusepi32_epi16(__mmask16 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovusdw512_mask( + (__v16si)__A, (__v16hi)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi64_epi32(__m512i __A) { + return (__m256i)__builtin_ia32_pmovqd512_mask( + (__v8di)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A) { + __builtin_ia32_pmovqd512mem_mask((__v8si *)__P, (__v8di)__A, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovqd512_mask((__v8di)__A, (__v8si)__O, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi64_epi32(__mmask8 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovqd512_mask( + (__v8di)__A, (__v8si)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtsepi64_epi32(__m512i __A) { + return (__m256i)__builtin_ia32_pmovsqd512_mask( + (__v8di)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A) { + __builtin_ia32_pmovsqd512mem_mask((__v8si *)__P, (__v8di)__A, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm512_mask_cvtsepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovsqd512_mask((__v8di)__A, (__v8si)__O, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtsepi64_epi32(__mmask8 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovsqd512_mask( + (__v8di)__A, (__v8si)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtusepi64_epi32(__m512i __A) { + return (__m256i)__builtin_ia32_pmovusqd512_mask( + (__v8di)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A) { + __builtin_ia32_pmovusqd512mem_mask((__v8si *)__P, (__v8di)__A, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtusepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovusqd512_mask((__v8di)__A, (__v8si)__O, + __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtusepi64_epi32(__mmask8 __M, __m512i __A) { + return (__m256i)__builtin_ia32_pmovusqd512_mask( + (__v8di)__A, (__v8si)_mm256_setzero_si256(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi64_epi16(__m512i __A) { + return (__m128i)__builtin_ia32_pmovqw512_mask( + (__v8di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A) { + __builtin_ia32_pmovqw512mem_mask((__v8hi *)__P, (__v8di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovqw512_mask((__v8di)__A, (__v8hi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi64_epi16(__mmask8 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovqw512_mask( + (__v8di)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtsepi64_epi16(__m512i __A) { + return (__m128i)__builtin_ia32_pmovsqw512_mask( + (__v8di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A) { + __builtin_ia32_pmovsqw512mem_mask((__v8hi *)__P, (__v8di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovsqw512_mask((__v8di)__A, (__v8hi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtsepi64_epi16(__mmask8 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovsqw512_mask( + (__v8di)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm512_cvtusepi64_epi16(__m512i __A) { + return (__m128i)__builtin_ia32_pmovusqw512_mask( + (__v8di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A) { + __builtin_ia32_pmovusqw512mem_mask((__v8hi *)__P, (__v8di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovusqw512_mask((__v8di)__A, (__v8hi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtusepi64_epi16(__mmask8 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovusqw512_mask( + (__v8di)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi64_epi8(__m512i __A) { + return (__m128i)__builtin_ia32_pmovqb512_mask( + (__v8di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, __m512i __A) { + __builtin_ia32_pmovqb512mem_mask((__v16qi *)__P, (__v8di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovqb512_mask((__v8di)__A, (__v16qi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi64_epi8(__mmask8 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovqb512_mask( + (__v8di)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtsepi64_epi8(__m512i __A) { + return (__m128i)__builtin_ia32_pmovsqb512_mask( + (__v8di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, __m512i __A) { + __builtin_ia32_pmovsqb512mem_mask((__v16qi *)__P, (__v8di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovsqb512_mask((__v8di)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtsepi64_epi8(__mmask8 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovsqb512_mask( + (__v8di)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtusepi64_epi8(__m512i __A) { + return (__m128i)__builtin_ia32_pmovusqb512_mask( + (__v8di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, __m512i __A) { + __builtin_ia32_pmovusqb512mem_mask((__v16qi *)__P, (__v8di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtusepi64_epi8(__m128i __O, 
__mmask8 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovusqb512_mask((__v8di)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtusepi64_epi8(__mmask8 __M, __m512i __A) { + return (__m128i)__builtin_ia32_pmovusqb512_mask( + (__v8di)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi32_pd(__m256i __A) { + return (__m512d)__builtin_ia32_cvtdq2pd512_mask( + (__v8si)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi32_pd(__m512d __W, __mmask8 __U, __m256i __A) { + return (__m512d)__builtin_ia32_cvtdq2pd512_mask((__v8si)__A, (__v8df)__W, + (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi32_pd(__mmask8 __U, __m256i __A) { + return (__m512d)__builtin_ia32_cvtdq2pd512_mask( + (__v8si)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepu32_pd(__m256i __A) { + return (__m512d)__builtin_ia32_cvtudq2pd512_mask( + (__v8si)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepu32_pd(__m512d __W, __mmask8 __U, __m256i __A) { + return (__m512d)__builtin_ia32_cvtudq2pd512_mask((__v8si)__A, (__v8df)__W, + (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepu32_pd(__mmask8 __U, __m256i __A) { + return (__m512d)__builtin_ia32_cvtudq2pd512_mask( + (__v8si)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvt_roundepi32_ps(__m512i __A, const int __R) { + return (__m512)__builtin_ia32_cvtdq2ps512_mask( + (__v16si)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvt_roundepi32_ps(__m512 __W, __mmask16 __U, __m512i __A, + const int __R) { + return (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)__A, (__v16sf)__W, + (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvt_roundepi32_ps(__mmask16 __U, __m512i __A, const int __R) { + return (__m512)__builtin_ia32_cvtdq2ps512_mask( + (__v16si)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvt_roundepu32_ps(__m512i __A, const int __R) { + return (__m512)__builtin_ia32_cvtudq2ps512_mask( + (__v16si)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvt_roundepu32_ps(__m512 __W, __mmask16 __U, __m512i __A, + const int __R) { + return (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)__A, (__v16sf)__W, + (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvt_roundepu32_ps(__mmask16 __U, __m512i __A, const int __R) { + 
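+/* Editor's note (not in upstream GCC): _mm512_cvtepi32_pd widens eight
+   int32 lanes to double treating them as signed, while _mm512_cvtepu32_pd
+   treats the same bits as unsigned.  The cvt_round* forms take an
+   explicit rounding mode instead of consulting MXCSR.  A small sketch,
+   assuming the usual _MM_FROUND_* constants from the SSE4.1 header:
+
+       __m256i u  = _mm256_set1_epi32(-1);
+       __m512d sd = _mm512_cvtepi32_pd(u);      // eight -1.0
+       __m512d ud = _mm512_cvtepu32_pd(u);      // eight 4294967295.0
+       __m512  f  = _mm512_cvt_roundepi32_ps(
+           _mm512_set1_epi32(3),
+           _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+*/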
return (__m512)__builtin_ia32_cvtudq2ps512_mask( + (__v16si)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); +} + +#else +#define _mm512_cvt_roundepi32_ps(A, B) \ + (__m512) __builtin_ia32_cvtdq2ps512_mask( \ + (__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B) + +#define _mm512_mask_cvt_roundepi32_ps(W, U, A, B) \ + (__m512) __builtin_ia32_cvtdq2ps512_mask((__v16si)(A), W, U, B) + +#define _mm512_maskz_cvt_roundepi32_ps(U, A, B) \ + (__m512) __builtin_ia32_cvtdq2ps512_mask((__v16si)(A), \ + (__v16sf)_mm512_setzero_ps(), U, B) + +#define _mm512_cvt_roundepu32_ps(A, B) \ + (__m512) __builtin_ia32_cvtudq2ps512_mask( \ + (__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B) + +#define _mm512_mask_cvt_roundepu32_ps(W, U, A, B) \ + (__m512) __builtin_ia32_cvtudq2ps512_mask((__v16si)(A), W, U, B) + +#define _mm512_maskz_cvt_roundepu32_ps(U, A, B) \ + (__m512) __builtin_ia32_cvtudq2ps512_mask( \ + (__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_extractf64x4_pd(__m512d __A, const int __imm) { + return (__m256d)__builtin_ia32_extractf64x4_mask( + (__v8df)__A, __imm, (__v4df)_mm256_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_extractf64x4_pd(__m256d __W, __mmask8 __U, __m512d __A, + const int __imm) { + return (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)__A, __imm, + (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_extractf64x4_pd(__mmask8 __U, __m512d __A, const int __imm) { + return (__m256d)__builtin_ia32_extractf64x4_mask( + (__v8df)__A, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_extractf32x4_ps(__m512 __A, const int __imm) { + return (__m128)__builtin_ia32_extractf32x4_mask( + (__v16sf)__A, __imm, (__v4sf)_mm_undefined_ps(), (__mmask8)-1); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m512 __A, + const int __imm) { + return (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)__A, __imm, + (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_extractf32x4_ps(__mmask8 __U, __m512 __A, const int __imm) { + return (__m128)__builtin_ia32_extractf32x4_mask( + (__v16sf)__A, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_extracti64x4_epi64(__m512i __A, const int __imm) { + return (__m256i)__builtin_ia32_extracti64x4_mask( + (__v8di)__A, __imm, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __A, + const int __imm) { + return (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)__A, __imm, + (__v4di)__W, (__mmask8)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A, const int __imm) { + return (__m256i)__builtin_ia32_extracti64x4_mask( + (__v8di)__A, __imm, 
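+/* Editor's note (not in upstream GCC): the extract[f|i]{64x4,32x4}
+   intrinsics pull a 256-bit or 128-bit lane out of a zmm register; the
+   immediate selects which lane (0..1 for x4-of-64, 0..3 for x4-of-32).
+   The #else macro forms exist because the builtin requires a literal
+   immediate, which the inline wrappers only guarantee under -O.  Sketch:
+
+       __m512d v  = _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
+       __m256d hi = _mm512_extractf64x4_pd(v, 1);   // {4.0, 5.0, 6.0, 7.0}
+*/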
(__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_extracti32x4_epi32(__m512i __A, const int __imm) { + return (__m128i)__builtin_ia32_extracti32x4_mask( + (__v16si)__A, __imm, (__v4si)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __A, + const int __imm) { + return (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)__A, __imm, + (__v4si)__W, (__mmask8)__U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A, const int __imm) { + return (__m128i)__builtin_ia32_extracti32x4_mask( + (__v16si)__A, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} +#else + +#define _mm512_extractf64x4_pd(X, C) \ + ((__m256d)__builtin_ia32_extractf64x4_mask( \ + (__v8df)(__m512d)(X), (int)(C), (__v4df)(__m256d)_mm256_undefined_pd(), \ + (__mmask8)-1)) + +#define _mm512_mask_extractf64x4_pd(W, U, X, C) \ + ((__m256d)__builtin_ia32_extractf64x4_mask( \ + (__v8df)(__m512d)(X), (int)(C), (__v4df)(__m256d)(W), (__mmask8)(U))) + +#define _mm512_maskz_extractf64x4_pd(U, X, C) \ + ((__m256d)__builtin_ia32_extractf64x4_mask( \ + (__v8df)(__m512d)(X), (int)(C), (__v4df)(__m256d)_mm256_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm512_extractf32x4_ps(X, C) \ + ((__m128)__builtin_ia32_extractf32x4_mask( \ + (__v16sf)(__m512)(X), (int)(C), (__v4sf)(__m128)_mm_undefined_ps(), \ + (__mmask8)-1)) + +#define _mm512_mask_extractf32x4_ps(W, U, X, C) \ + ((__m128)__builtin_ia32_extractf32x4_mask( \ + (__v16sf)(__m512)(X), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U))) + +#define _mm512_maskz_extractf32x4_ps(U, X, C) \ + ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(X), (int)(C), \ + (__v4sf)(__m128)_mm_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm512_extracti64x4_epi64(X, C) \ + ((__m256i)__builtin_ia32_extracti64x4_mask( \ + (__v8di)(__m512i)(X), (int)(C), \ + (__v4di)(__m256i)_mm256_undefined_si256(), (__mmask8)-1)) + +#define _mm512_mask_extracti64x4_epi64(W, U, X, C) \ + ((__m256i)__builtin_ia32_extracti64x4_mask( \ + (__v8di)(__m512i)(X), (int)(C), (__v4di)(__m256i)(W), (__mmask8)(U))) + +#define _mm512_maskz_extracti64x4_epi64(U, X, C) \ + ((__m256i)__builtin_ia32_extracti64x4_mask( \ + (__v8di)(__m512i)(X), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256(), \ + (__mmask8)(U))) + +#define _mm512_extracti32x4_epi32(X, C) \ + ((__m128i)__builtin_ia32_extracti32x4_mask( \ + (__v16si)(__m512i)(X), (int)(C), (__v4si)(__m128i)_mm_undefined_si128(), \ + (__mmask8)-1)) + +#define _mm512_mask_extracti32x4_epi32(W, U, X, C) \ + ((__m128i)__builtin_ia32_extracti32x4_mask( \ + (__v16si)(__m512i)(X), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U))) + +#define _mm512_maskz_extracti32x4_epi32(U, X, C) \ + ((__m128i)__builtin_ia32_extracti32x4_mask( \ + (__v16si)(__m512i)(X), (int)(C), (__v4si)(__m128i)_mm_setzero_si128(), \ + (__mmask8)(U))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_inserti32x4(__m512i __A, __m128i __B, const int __imm) { + return (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)__A, (__v4si)__B, + __imm, (__v16si)__A, -1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
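+/* Editor's note (not in upstream GCC): the unmasked insert intrinsics
+   pass the destination vector itself as the blend fallback with an
+   all-ones mask, so the masking machinery is a no-op there.  Sketch:
+
+       __m512i big = _mm512_setzero_si512();
+       __m128i sm  = _mm_set1_epi32(42);
+       big = _mm512_inserti32x4(big, sm, 3);    // 128-bit lane 3 = {42,...}
+*/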
_mm512_insertf32x4(__m512 __A, __m128 __B, const int __imm) { + return (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)__A, (__v4sf)__B, + __imm, (__v16sf)__A, -1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_inserti64x4(__m512i __A, __m256i __B, const int __imm) { + return (__m512i)__builtin_ia32_inserti64x4_mask( + (__v8di)__A, (__v4di)__B, __imm, (__v8di)_mm512_undefined_epi32(), + (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_inserti64x4(__m512i __W, __mmask8 __U, __m512i __A, __m256i __B, + const int __imm) { + return (__m512i)__builtin_ia32_inserti64x4_mask( + (__v8di)__A, (__v4di)__B, __imm, (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_inserti64x4(__mmask8 __U, __m512i __A, __m256i __B, + const int __imm) { + return (__m512i)__builtin_ia32_inserti64x4_mask( + (__v8di)__A, (__v4di)__B, __imm, (__v8di)_mm512_setzero_si512(), + (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_insertf64x4(__m512d __A, __m256d __B, const int __imm) { + return (__m512d)__builtin_ia32_insertf64x4_mask( + (__v8df)__A, (__v4df)__B, __imm, (__v8df)_mm512_undefined_pd(), + (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_insertf64x4(__m512d __W, __mmask8 __U, __m512d __A, __m256d __B, + const int __imm) { + return (__m512d)__builtin_ia32_insertf64x4_mask( + (__v8df)__A, (__v4df)__B, __imm, (__v8df)__W, (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_insertf64x4(__mmask8 __U, __m512d __A, __m256d __B, + const int __imm) { + return (__m512d)__builtin_ia32_insertf64x4_mask( + (__v8df)__A, (__v4df)__B, __imm, (__v8df)_mm512_setzero_pd(), + (__mmask8)__U); +} +#else +#define _mm512_insertf32x4(X, Y, C) \ + ((__m512)__builtin_ia32_insertf32x4_mask( \ + (__v16sf)(__m512)(X), (__v4sf)(__m128)(Y), (int)(C), \ + (__v16sf)(__m512)(X), (__mmask16)(-1))) + +#define _mm512_inserti32x4(X, Y, C) \ + ((__m512i)__builtin_ia32_inserti32x4_mask( \ + (__v16si)(__m512i)(X), (__v4si)(__m128i)(Y), (int)(C), \ + (__v16si)(__m512i)(X), (__mmask16)(-1))) + +#define _mm512_insertf64x4(X, Y, C) \ + ((__m512d)__builtin_ia32_insertf64x4_mask( \ + (__v8df)(__m512d)(X), (__v4df)(__m256d)(Y), (int)(C), \ + (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1)) + +#define _mm512_mask_insertf64x4(W, U, X, Y, C) \ + ((__m512d)__builtin_ia32_insertf64x4_mask( \ + (__v8df)(__m512d)(X), (__v4df)(__m256d)(Y), (int)(C), \ + (__v8df)(__m512d)(W), (__mmask8)(U))) + +#define _mm512_maskz_insertf64x4(U, X, Y, C) \ + ((__m512d)__builtin_ia32_insertf64x4_mask( \ + (__v8df)(__m512d)(X), (__v4df)(__m256d)(Y), (int)(C), \ + (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U))) + +#define _mm512_inserti64x4(X, Y, C) \ + ((__m512i)__builtin_ia32_inserti64x4_mask( \ + (__v8di)(__m512i)(X), (__v4di)(__m256i)(Y), (int)(C), \ + (__v8di)(__m512i)_mm512_undefined_epi32(), (__mmask8)-1)) + +#define _mm512_mask_inserti64x4(W, U, X, Y, C) \ + ((__m512i)__builtin_ia32_inserti64x4_mask( \ + (__v8di)(__m512i)(X), (__v4di)(__m256i)(Y), (int)(C), \ + (__v8di)(__m512i)(W), (__mmask8)(U))) + +#define _mm512_maskz_inserti64x4(U, X, Y, C) \ + ((__m512i)__builtin_ia32_inserti64x4_mask( \ + 
(__v8di)(__m512i)(X), (__v4di)(__m256i)(Y), (int)(C), \ + (__v8di)(__m512i)_mm512_setzero_si512(), (__mmask8)(U))) +#endif + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_loadu_pd(void const *__P) { + return *(__m512d_u *)__P; +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_loadu_pd(__m512d __W, __mmask8 __U, void const *__P) { + return (__m512d)__builtin_ia32_loadupd512_mask((const double *)__P, + (__v8df)__W, (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P) { + return (__m512d)__builtin_ia32_loadupd512_mask( + (const double *)__P, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_storeu_pd(void *__P, __m512d __A) { + *(__m512d_u *)__P = __A; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A) { + __builtin_ia32_storeupd512_mask((double *)__P, (__v8df)__A, (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_loadu_ps(void const *__P) { + return *(__m512_u *)__P; +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_loadu_ps(__m512 __W, __mmask16 __U, void const *__P) { + return (__m512)__builtin_ia32_loadups512_mask((const float *)__P, + (__v16sf)__W, (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P) { + return (__m512)__builtin_ia32_loadups512_mask( + (const float *)__P, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_storeu_ps(void *__P, __m512 __A) { + *(__m512_u *)__P = __A; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A) { + __builtin_ia32_storeups512_mask((float *)__P, (__v16sf)__A, (__mmask16)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_load_ss(__m128 __W, __mmask8 __U, const float *__P) { + return (__m128)__builtin_ia32_loadss_mask(__P, (__v4sf)__W, __U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_load_ss(__mmask8 __U, const float *__P) { + return (__m128)__builtin_ia32_loadss_mask(__P, (__v4sf)_mm_setzero_ps(), __U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_load_sd(__m128d __W, __mmask8 __U, const double *__P) { + return (__m128d)__builtin_ia32_loadsd_mask(__P, (__v2df)__W, __U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_load_sd(__mmask8 __U, const double *__P) { + return (__m128d)__builtin_ia32_loadsd_mask(__P, (__v2df)_mm_setzero_pd(), + __U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_move_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_movess_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, __U); +} + +extern 
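+/* Editor's note (not in upstream GCC): _mm512_loadu_pd/_mm512_storeu_pd
+   dereference the unaligned __m512d_u type, so GCC emits vmovupd and no
+   64-byte alignment is required.  The masked scalar loads/moves in this
+   group take lane 0 from memory (or from __B) only when bit 0 of the
+   mask is set.  Sketch:
+
+       double buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};   // arbitrary alignment
+       __m512d v = _mm512_loadu_pd(buf);
+       _mm512_storeu_pd(buf, _mm512_add_pd(v, v));
+*/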
__inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_move_ss(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_movess_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)_mm_setzero_ps(), __U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_move_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_movesd_mask((__v2df)__A, (__v2df)__B, + (__v2df)__W, __U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_move_sd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_movesd_mask((__v2df)__A, (__v2df)__B, + (__v2df)_mm_setzero_pd(), __U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_store_ss(float *__P, __mmask8 __U, __m128 __A) { + __builtin_ia32_storess_mask(__P, (__v4sf)__A, (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_store_sd(double *__P, __mmask8 __U, __m128d __A) { + __builtin_ia32_storesd_mask(__P, (__v2df)__A, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_loadu_epi64(__m512i __W, __mmask8 __U, void const *__P) { + return (__m512i)__builtin_ia32_loaddqudi512_mask((const long long *)__P, + (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P) { + return (__m512i)__builtin_ia32_loaddqudi512_mask( + (const long long *)__P, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) { + __builtin_ia32_storedqudi512_mask((long long *)__P, (__v8di)__A, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_loadu_si512(void const *__P) { + return *(__m512i_u *)__P; +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_loadu_epi32(__m512i __W, __mmask16 __U, void const *__P) { + return (__m512i)__builtin_ia32_loaddqusi512_mask( + (const int *)__P, (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P) { + return (__m512i)__builtin_ia32_loaddqusi512_mask( + (const int *)__P, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_storeu_si512(void *__P, __m512i __A) { + *(__m512i_u *)__P = __A; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A) { + __builtin_ia32_storedqusi512_mask((int *)__P, (__v16si)__A, (__mmask16)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permutevar_pd(__m512d __A, __m512i __C) { + return (__m512d)__builtin_ia32_vpermilvarpd512_mask( + (__v8df)__A, (__v8di)__C, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) 
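+/* Editor's note (not in upstream GCC): the masked loadu/storeu_epi32/64
+   forms above suppress faults for masked-off lanes, and vpermilvar
+   (_mm512_permutevar_ps/pd) permutes within each 128-bit lane using the
+   low bits of each index element.  Sketch, given some __m512 v:
+
+       __m512i idx = _mm512_set1_epi32(0);
+       __m512  r   = _mm512_permutevar_ps(v, idx); // broadcast element 0
+                                                   // of every 128-bit lane
+*/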
+_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) { + return (__m512d)__builtin_ia32_vpermilvarpd512_mask( + (__v8df)__A, (__v8di)__C, (__v8df)__W, (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) { + return (__m512d)__builtin_ia32_vpermilvarpd512_mask( + (__v8df)__A, (__v8di)__C, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permutevar_ps(__m512 __A, __m512i __C) { + return (__m512)__builtin_ia32_vpermilvarps512_mask( + (__v16sf)__A, (__v16si)__C, (__v16sf)_mm512_undefined_ps(), + (__mmask16)-1); +} + +extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) { + return (__m512)__builtin_ia32_vpermilvarps512_mask( + (__v16sf)__A, (__v16si)__C, (__v16sf)__W, (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) { + return (__m512)__builtin_ia32_vpermilvarps512_mask( + (__v16sf)__A, (__v16si)__C, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) { + return (__m512i)__builtin_ia32_vpermt2varq512_mask((__v8di)__I + /* idx */, + (__v8di)__A, (__v8di)__B, + (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I, + __m512i __B) { + return (__m512i)__builtin_ia32_vpermt2varq512_mask((__v8di)__I + /* idx */, + (__v8di)__A, (__v8di)__B, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, + __m512i __B) { + return (__m512i)__builtin_ia32_vpermi2varq512_mask((__v8di)__A, + (__v8di)__I + /* idx */, + (__v8di)__B, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, + __m512i __B) { + return (__m512i)__builtin_ia32_vpermt2varq512_maskz((__v8di)__I + /* idx */, + (__v8di)__A, (__v8di)__B, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) { + return (__m512i)__builtin_ia32_vpermt2vard512_mask((__v16si)__I + /* idx */, + (__v16si)__A, (__v16si)__B, + (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I, + __m512i __B) { + return (__m512i)__builtin_ia32_vpermt2vard512_mask((__v16si)__I + /* idx */, + (__v16si)__A, (__v16si)__B, + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, + __m512i __B) { + return (__m512i)__builtin_ia32_vpermi2vard512_mask((__v16si)__A, + (__v16si)__I + /* idx */, + (__v16si)__B, + (__mmask16)__U); +} + +extern 
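+/* Editor's note (not in upstream GCC): permutex2var is a full two-source
+   shuffle: each index element picks from the concatenation of __A and
+   __B (for epi64, idx bits [2:0] pick the element and bit 3 picks the
+   source).  The _mask_ form blends unselected lanes from __A (vpermt2*),
+   while the _mask2_ form preserves the index operand instead (vpermi2*).
+   Sketch:
+
+       __m512i a = _mm512_set1_epi64(1), b = _mm512_set1_epi64(2);
+       __m512i i = _mm512_set_epi64(15, 7, 14, 6, 13, 5, 12, 4);
+       __m512i r = _mm512_permutex2var_epi64(a, i, b);
+       // r alternates upper-half elements of a and b
+*/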
__inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, + __m512i __B) { + return (__m512i)__builtin_ia32_vpermt2vard512_maskz( + (__v16si)__I + /* idx */, + (__v16si)__A, (__v16si)__B, (__mmask16)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) { + return (__m512d)__builtin_ia32_vpermt2varpd512_mask((__v8di)__I + /* idx */, + (__v8df)__A, (__v8df)__B, + (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, + __m512d __B) { + return (__m512d)__builtin_ia32_vpermt2varpd512_mask((__v8di)__I + /* idx */, + (__v8df)__A, (__v8df)__B, + (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, + __m512d __B) { + return (__m512d)__builtin_ia32_vpermi2varpd512_mask((__v8df)__A, + (__v8di)__I + /* idx */, + (__v8df)__B, + (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I, + __m512d __B) { + return (__m512d)__builtin_ia32_vpermt2varpd512_maskz((__v8di)__I + /* idx */, + (__v8df)__A, (__v8df)__B, + (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) { + return (__m512)__builtin_ia32_vpermt2varps512_mask((__v16si)__I + /* idx */, + (__v16sf)__A, (__v16sf)__B, + (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, + __m512 __B) { + return (__m512)__builtin_ia32_vpermt2varps512_mask((__v16si)__I + /* idx */, + (__v16sf)__A, (__v16sf)__B, + (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, + __m512 __B) { + return (__m512)__builtin_ia32_vpermi2varps512_mask((__v16sf)__A, + (__v16si)__I + /* idx */, + (__v16sf)__B, + (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, + __m512 __B) { + return (__m512)__builtin_ia32_vpermt2varps512_maskz( + (__v16si)__I + /* idx */, + (__v16sf)__A, (__v16sf)__B, (__mmask16)__U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permute_pd(__m512d __X, const int __C) { + return (__m512d)__builtin_ia32_vpermilpd512_mask( + (__v8df)__X, __C, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_permute_pd(__m512d __W, __mmask8 __U, __m512d __X, const int __C) { + return (__m512d)__builtin_ia32_vpermilpd512_mask((__v8df)__X, __C, + (__v8df)__W, (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permute_pd(__mmask8 __U, __m512d __X, const int __C) { + return (__m512d)__builtin_ia32_vpermilpd512_mask( 
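+/* Editor's note (not in upstream GCC): _mm512_permute_ps/_mm512_permute_pd
+   are the vpermilps/vpermilpd immediate forms: the same selector byte is
+   applied inside every 128-bit lane (two bits per float, one bit per
+   double).  Sketch, given some __m512 v and _MM_SHUFFLE from xmmintrin:
+
+       __m512 r = _mm512_permute_ps(v, _MM_SHUFFLE(0, 1, 2, 3));
+       // reverses the four floats inside each 128-bit lane of v
+*/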
+ (__v8df)__X, __C, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permute_ps(__m512 __X, const int __C) { + return (__m512)__builtin_ia32_vpermilps512_mask( + (__v16sf)__X, __C, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); +} + +extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_permute_ps(__m512 __W, __mmask16 __U, __m512 __X, const int __C) { + return (__m512)__builtin_ia32_vpermilps512_mask((__v16sf)__X, __C, + (__v16sf)__W, (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permute_ps(__mmask16 __U, __m512 __X, const int __C) { + return (__m512)__builtin_ia32_vpermilps512_mask( + (__v16sf)__X, __C, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} +#else +#define _mm512_permute_pd(X, C) \ + ((__m512d)__builtin_ia32_vpermilpd512_mask( \ + (__v8df)(__m512d)(X), (int)(C), (__v8df)(__m512d)_mm512_undefined_pd(), \ + (__mmask8)(-1))) + +#define _mm512_mask_permute_pd(W, U, X, C) \ + ((__m512d)__builtin_ia32_vpermilpd512_mask( \ + (__v8df)(__m512d)(X), (int)(C), (__v8df)(__m512d)(W), (__mmask8)(U))) + +#define _mm512_maskz_permute_pd(U, X, C) \ + ((__m512d)__builtin_ia32_vpermilpd512_mask( \ + (__v8df)(__m512d)(X), (int)(C), (__v8df)(__m512d)_mm512_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm512_permute_ps(X, C) \ + ((__m512)__builtin_ia32_vpermilps512_mask( \ + (__v16sf)(__m512)(X), (int)(C), (__v16sf)(__m512)_mm512_undefined_ps(), \ + (__mmask16)(-1))) + +#define _mm512_mask_permute_ps(W, U, X, C) \ + ((__m512)__builtin_ia32_vpermilps512_mask( \ + (__v16sf)(__m512)(X), (int)(C), (__v16sf)(__m512)(W), (__mmask16)(U))) + +#define _mm512_maskz_permute_ps(U, X, C) \ + ((__m512)__builtin_ia32_vpermilps512_mask( \ + (__v16sf)(__m512)(X), (int)(C), (__v16sf)(__m512)_mm512_setzero_ps(), \ + (__mmask16)(U))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permutex_epi64(__m512i __X, const int __I) { + return (__m512i)__builtin_ia32_permdi512_mask( + (__v8di)__X, __I, (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1)); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_permutex_epi64(__m512i __W, __mmask8 __M, __m512i __X, + const int __I) { + return (__m512i)__builtin_ia32_permdi512_mask((__v8di)__X, __I, (__v8di)__W, + (__mmask8)__M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permutex_epi64(__mmask8 __M, __m512i __X, const int __I) { + return (__m512i)__builtin_ia32_permdi512_mask( + (__v8di)__X, __I, (__v8di)_mm512_setzero_si512(), (__mmask8)__M); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permutex_pd(__m512d __X, const int __M) { + return (__m512d)__builtin_ia32_permdf512_mask( + (__v8df)__X, __M, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_permutex_pd(__m512d __W, __mmask8 __U, __m512d __X, const int __M) { + return (__m512d)__builtin_ia32_permdf512_mask((__v8df)__X, __M, (__v8df)__W, + (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permutex_pd(__mmask8 __U, __m512d __X, const int 
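+/* Editor's note (not in upstream GCC): permutex is the cross-lane
+   immediate form (vpermq/vpermpd): the 2-bit selectors address 64-bit
+   elements within each 256-bit half of the zmm register.  Sketch, given
+   some __m512i x:
+
+       __m512i r = _mm512_permutex_epi64(x, _MM_SHUFFLE(0, 1, 2, 3));
+       // reverses the four quadwords inside each 256-bit half of x
+*/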
__M) { + return (__m512d)__builtin_ia32_permdf512_mask( + (__v8df)__X, __M, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} +#else +#define _mm512_permutex_pd(X, M) \ + ((__m512d)__builtin_ia32_permdf512_mask( \ + (__v8df)(__m512d)(X), (int)(M), (__v8df)(__m512d)_mm512_undefined_pd(), \ + (__mmask8)-1)) + +#define _mm512_mask_permutex_pd(W, U, X, M) \ + ((__m512d)__builtin_ia32_permdf512_mask( \ + (__v8df)(__m512d)(X), (int)(M), (__v8df)(__m512d)(W), (__mmask8)(U))) + +#define _mm512_maskz_permutex_pd(U, X, M) \ + ((__m512d)__builtin_ia32_permdf512_mask( \ + (__v8df)(__m512d)(X), (int)(M), (__v8df)(__m512d)_mm512_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm512_permutex_epi64(X, I) \ + ((__m512i)__builtin_ia32_permdi512_mask( \ + (__v8di)(__m512i)(X), (int)(I), \ + (__v8di)(__m512i)(_mm512_undefined_epi32()), (__mmask8)(-1))) + +#define _mm512_maskz_permutex_epi64(M, X, I) \ + ((__m512i)__builtin_ia32_permdi512_mask( \ + (__v8di)(__m512i)(X), (int)(I), \ + (__v8di)(__m512i)(_mm512_setzero_si512()), (__mmask8)(M))) + +#define _mm512_mask_permutex_epi64(W, M, X, I) \ + ((__m512i)__builtin_ia32_permdi512_mask( \ + (__v8di)(__m512i)(X), (int)(I), (__v8di)(__m512i)(W), (__mmask8)(M))) +#endif + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permutexvar_epi64(__mmask8 __M, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_permvardi512_mask( + (__v8di)__Y, (__v8di)__X, (__v8di)_mm512_setzero_si512(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permutexvar_epi64(__m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_permvardi512_mask( + (__v8di)__Y, (__v8di)__X, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_permutexvar_epi64(__m512i __W, __mmask8 __M, __m512i __X, + __m512i __Y) { + return (__m512i)__builtin_ia32_permvardi512_mask((__v8di)__Y, (__v8di)__X, + (__v8di)__W, __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permutexvar_epi32(__mmask16 __M, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_permvarsi512_mask( + (__v16si)__Y, (__v16si)__X, (__v16si)_mm512_setzero_si512(), __M); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permutexvar_epi32(__m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_permvarsi512_mask( + (__v16si)__Y, (__v16si)__X, (__v16si)_mm512_undefined_epi32(), + (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_permutexvar_epi32(__m512i __W, __mmask16 __M, __m512i __X, + __m512i __Y) { + return (__m512i)__builtin_ia32_permvarsi512_mask((__v16si)__Y, (__v16si)__X, + (__v16si)__W, __M); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permutexvar_pd(__m512i __X, __m512d __Y) { + return (__m512d)__builtin_ia32_permvardf512_mask( + (__v8df)__Y, (__v8di)__X, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_permutexvar_pd(__m512d __W, __mmask8 __U, __m512i __X, + __m512d __Y) { + return (__m512d)__builtin_ia32_permvardf512_mask((__v8df)__Y, (__v8di)__X, + (__v8df)__W, (__mmask8)__U); +} + +extern __inline __m512d + 
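+/* Editor's note (not in upstream GCC): permutexvar shuffles across the
+   whole 512-bit register with a vector of indices, and the index operand
+   comes first, mirroring Intel's _mm512_permutexvar_*(idx, a) signature
+   even though the underlying builtin takes the data vector first.
+   Sketch, given some __m512i v:
+
+       __m512i idx = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+       __m512i rev = _mm512_permutexvar_epi64(idx, v); // reverse all 8 lanes
+*/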
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permutexvar_pd(__mmask8 __U, __m512i __X, __m512d __Y) { + return (__m512d)__builtin_ia32_permvardf512_mask( + (__v8df)__Y, (__v8di)__X, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_permutexvar_ps(__m512i __X, __m512 __Y) { + return (__m512)__builtin_ia32_permvarsf512_mask( + (__v16sf)__Y, (__v16si)__X, (__v16sf)_mm512_undefined_ps(), + (__mmask16)-1); +} + +extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_permutexvar_ps(__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) { + return (__m512)__builtin_ia32_permvarsf512_mask((__v16sf)__Y, (__v16si)__X, + (__v16sf)__W, (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_permutexvar_ps(__mmask16 __U, __m512i __X, __m512 __Y) { + return (__m512)__builtin_ia32_permvarsf512_mask( + (__v16sf)__Y, (__v16si)__X, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shuffle_ps(__m512 __M, __m512 __V, const int __imm) { + return (__m512)__builtin_ia32_shufps512_mask( + (__v16sf)__M, (__v16sf)__V, __imm, (__v16sf)_mm512_undefined_ps(), + (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_shuffle_ps(__m512 __W, __mmask16 __U, __m512 __M, __m512 __V, + const int __imm) { + return (__m512)__builtin_ia32_shufps512_mask( + (__v16sf)__M, (__v16sf)__V, __imm, (__v16sf)__W, (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_shuffle_ps(__mmask16 __U, __m512 __M, __m512 __V, + const int __imm) { + return (__m512)__builtin_ia32_shufps512_mask( + (__v16sf)__M, (__v16sf)__V, __imm, (__v16sf)_mm512_setzero_ps(), + (__mmask16)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shuffle_pd(__m512d __M, __m512d __V, const int __imm) { + return (__m512d)__builtin_ia32_shufpd512_mask((__v8df)__M, (__v8df)__V, __imm, + (__v8df)_mm512_undefined_pd(), + (__mmask8)-1); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_shuffle_pd(__m512d __W, __mmask8 __U, __m512d __M, __m512d __V, + const int __imm) { + return (__m512d)__builtin_ia32_shufpd512_mask((__v8df)__M, (__v8df)__V, __imm, + (__v8df)__W, (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_shuffle_pd(__mmask8 __U, __m512d __M, __m512d __V, + const int __imm) { + return (__m512d)__builtin_ia32_shufpd512_mask((__v8df)__M, (__v8df)__V, __imm, + (__v8df)_mm512_setzero_pd(), + (__mmask8)__U); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fixupimm_round_pd(__m512d __A, __m512d __B, __m512i __C, + const int __imm, const int __R) { + return (__m512d)__builtin_ia32_fixupimmpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)-1, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fixupimm_round_pd(__m512d __A, __mmask8 __U, __m512d __B, + __m512i __C, const int __imm, const int __R) { + return 
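+/* Editor's note (not in upstream GCC): _mm512_shuffle_ps applies the
+   classic shufps selector per 128-bit lane (low two elements from __M,
+   high two from __V), and the fixupimm_round group rewrites each lane
+   according to a per-class token table supplied in the integer operand,
+   with an explicit rounding/exception mode __R such as
+   _MM_FROUND_CUR_DIRECTION.  Sketch, given vectors a and b:
+
+       __m512 lo = _mm512_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 1, 0));
+       // per 128-bit lane: {a0, a1, b0, b1}
+*/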
(__m512d)__builtin_ia32_fixupimmpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)__U, __R); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fixupimm_round_pd(__mmask8 __U, __m512d __A, __m512d __B, + __m512i __C, const int __imm, + const int __R) { + return (__m512d)__builtin_ia32_fixupimmpd512_maskz( + (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fixupimm_round_ps(__m512 __A, __m512 __B, __m512i __C, + const int __imm, const int __R) { + return (__m512)__builtin_ia32_fixupimmps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)-1, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fixupimm_round_ps(__m512 __A, __mmask16 __U, __m512 __B, + __m512i __C, const int __imm, const int __R) { + return (__m512)__builtin_ia32_fixupimmps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)__U, __R); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fixupimm_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + __m512i __C, const int __imm, + const int __R) { + return (__m512)__builtin_ia32_fixupimmps512_maskz( + (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fixupimm_round_sd(__m128d __A, __m128d __B, __m128i __C, + const int __imm, const int __R) { + return (__m128d)__builtin_ia32_fixupimmsd_mask( + (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)-1, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fixupimm_round_sd(__m128d __A, __mmask8 __U, __m128d __B, + __m128i __C, const int __imm, const int __R) { + return (__m128d)__builtin_ia32_fixupimmsd_mask( + (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_fixupimm_round_sd(__mmask8 __U, __m128d __A, __m128d __B, + __m128i __C, const int __imm, const int __R) { + return (__m128d)__builtin_ia32_fixupimmsd_maskz( + (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fixupimm_round_ss(__m128 __A, __m128 __B, __m128i __C, const int __imm, + const int __R) { + return (__m128)__builtin_ia32_fixupimmss_mask( + (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)-1, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fixupimm_round_ss(__m128 __A, __mmask8 __U, __m128 __B, + __m128i __C, const int __imm, const int __R) { + return (__m128)__builtin_ia32_fixupimmss_mask( + (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_fixupimm_round_ss(__mmask8 __U, __m128 __A, __m128 __B, + __m128i __C, const int __imm, const int __R) { + return (__m128)__builtin_ia32_fixupimmss_maskz( + (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U, __R); +} + +#else +#define _mm512_shuffle_pd(X, Y, C) \ + ((__m512d)__builtin_ia32_shufpd512_mask( \ + 
(__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), \ + (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1)) + +#define _mm512_mask_shuffle_pd(W, U, X, Y, C) \ + ((__m512d)__builtin_ia32_shufpd512_mask( \ + (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), \ + (__v8df)(__m512d)(W), (__mmask8)(U))) + +#define _mm512_maskz_shuffle_pd(U, X, Y, C) \ + ((__m512d)__builtin_ia32_shufpd512_mask( \ + (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(C), \ + (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U))) + +#define _mm512_shuffle_ps(X, Y, C) \ + ((__m512)__builtin_ia32_shufps512_mask( \ + (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), \ + (__v16sf)(__m512)_mm512_undefined_ps(), (__mmask16)-1)) + +#define _mm512_mask_shuffle_ps(W, U, X, Y, C) \ + ((__m512)__builtin_ia32_shufps512_mask( \ + (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), \ + (__v16sf)(__m512)(W), (__mmask16)(U))) + +#define _mm512_maskz_shuffle_ps(U, X, Y, C) \ + ((__m512)__builtin_ia32_shufps512_mask( \ + (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(C), \ + (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U))) + +#define _mm512_fixupimm_round_pd(X, Y, Z, C, R) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask( \ + (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), \ + (int)(C), (__mmask8)(-1), (R))) + +#define _mm512_mask_fixupimm_round_pd(X, U, Y, Z, C, R) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask( \ + (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), \ + (int)(C), (__mmask8)(U), (R))) + +#define _mm512_maskz_fixupimm_round_pd(U, X, Y, Z, C, R) \ + ((__m512d)__builtin_ia32_fixupimmpd512_maskz( \ + (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), \ + (int)(C), (__mmask8)(U), (R))) + +#define _mm512_fixupimm_round_ps(X, Y, Z, C, R) \ + ((__m512)__builtin_ia32_fixupimmps512_mask( \ + (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), \ + (int)(C), (__mmask16)(-1), (R))) + +#define _mm512_mask_fixupimm_round_ps(X, U, Y, Z, C, R) \ + ((__m512)__builtin_ia32_fixupimmps512_mask( \ + (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), \ + (int)(C), (__mmask16)(U), (R))) + +#define _mm512_maskz_fixupimm_round_ps(U, X, Y, Z, C, R) \ + ((__m512)__builtin_ia32_fixupimmps512_maskz( \ + (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), \ + (int)(C), (__mmask16)(U), (R))) + +#define _mm_fixupimm_round_sd(X, Y, Z, C, R) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask( \ + (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), \ + (int)(C), (__mmask8)(-1), (R))) + +#define _mm_mask_fixupimm_round_sd(X, U, Y, Z, C, R) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask( \ + (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), \ + (int)(C), (__mmask8)(U), (R))) + +#define _mm_maskz_fixupimm_round_sd(U, X, Y, Z, C, R) \ + ((__m128d)__builtin_ia32_fixupimmsd_maskz( \ + (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), \ + (int)(C), (__mmask8)(U), (R))) + +#define _mm_fixupimm_round_ss(X, Y, Z, C, R) \ + ((__m128)__builtin_ia32_fixupimmss_mask( \ + (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), \ + (int)(C), (__mmask8)(-1), (R))) + +#define _mm_mask_fixupimm_round_ss(X, U, Y, Z, C, R) \ + ((__m128)__builtin_ia32_fixupimmss_mask( \ + (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), \ + (int)(C), (__mmask8)(U), (R))) + +#define _mm_maskz_fixupimm_round_ss(U, X, Y, Z, C, R) \ + ((__m128)__builtin_ia32_fixupimmss_maskz( \ + (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), 
(__v4si)(__m128i)(Z), \ + (int)(C), (__mmask8)(U), (R))) +#endif + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_movehdup_ps(__m512 __A) { + return (__m512)__builtin_ia32_movshdup512_mask( + (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_movehdup_ps(__m512 __W, __mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_movshdup512_mask((__v16sf)__A, (__v16sf)__W, + (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_movehdup_ps(__mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_movshdup512_mask( + (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_moveldup_ps(__m512 __A) { + return (__m512)__builtin_ia32_movsldup512_mask( + (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_moveldup_ps(__m512 __W, __mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_movsldup512_mask((__v16sf)__A, (__v16sf)__W, + (__mmask16)__U); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_moveldup_ps(__mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_movsldup512_mask( + (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_or_si512(__m512i __A, __m512i __B) { + return (__m512i)((__v16su)__A | (__v16su)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_or_epi32(__m512i __A, __m512i __B) { + return (__m512i)((__v16su)__A | (__v16su)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_or_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pord512_mask((__v16si)__A, (__v16si)__B, + (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_or_epi32(__mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pord512_mask((__v16si)__A, (__v16si)__B, + (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_or_epi64(__m512i __A, __m512i __B) { + return (__m512i)((__v8du)__A | (__v8du)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_or_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_porq512_mask((__v8di)__A, (__v8di)__B, + (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_or_epi64(__mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_porq512_mask( + (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_xor_si512(__m512i __A, __m512i __B) { + return (__m512i)((__v16su)__A ^ (__v16su)__B); +} + +extern 
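+/* Editor's note (not in upstream GCC): movehdup/moveldup duplicate the
+   odd/even float of each pair, and the unmasked logical ops here are
+   plain GNU vector expressions rather than builtins, so they constant
+   fold; _mm512_or_si512, _mm512_or_epi32 and _mm512_or_epi64 all produce
+   the same 512-bit OR, with element width mattering only for the masked
+   forms.  Sketch, given vectors a and b:
+
+       __m512i m = _mm512_or_si512(a, b);   // same bits as _mm512_or_epi64
+*/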
__inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_xor_epi32(__m512i __A, __m512i __B) { + return (__m512i)((__v16su)__A ^ (__v16su)__B); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_xor_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pxord512_mask((__v16si)__A, (__v16si)__B, + (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_xor_epi32(__mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pxord512_mask((__v16si)__A, (__v16si)__B, + (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_xor_epi64(__m512i __A, __m512i __B) { + return (__m512i)((__v8du)__A ^ (__v8du)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_xor_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pxorq512_mask((__v8di)__A, (__v8di)__B, + (__v8di)__W, (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_xor_epi64(__mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pxorq512_mask( + (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_rol_epi32(__m512i __A, const int __B) { + return (__m512i)__builtin_ia32_prold512_mask( + (__v16si)__A, __B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_rol_epi32(__m512i __W, __mmask16 __U, __m512i __A, const int __B) { + return (__m512i)__builtin_ia32_prold512_mask((__v16si)__A, __B, (__v16si)__W, + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_rol_epi32(__mmask16 __U, __m512i __A, const int __B) { + return (__m512i)__builtin_ia32_prold512_mask( + (__v16si)__A, __B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_ror_epi32(__m512i __A, int __B) { + return (__m512i)__builtin_ia32_prord512_mask( + (__v16si)__A, __B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_ror_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) { + return (__m512i)__builtin_ia32_prord512_mask((__v16si)__A, __B, (__v16si)__W, + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_ror_epi32(__mmask16 __U, __m512i __A, int __B) { + return (__m512i)__builtin_ia32_prord512_mask( + (__v16si)__A, __B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_rol_epi64(__m512i __A, const int __B) { + return (__m512i)__builtin_ia32_prolq512_mask( + (__v8di)__A, __B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + 
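+/* Editor's note (not in upstream GCC): rol/ror rotate each 32- or 64-bit
+   lane by an immediate; as with the extracts, the macro fallbacks below
+   keep the count a literal at -O0.  (Upstream quirk: the ror wrappers
+   declare the count as plain `int' where rol uses `const int'; both
+   behave identically.)  Sketch:
+
+       __m512i x = _mm512_set1_epi32(0x000000FF);
+       __m512i r = _mm512_rol_epi32(x, 8);  // every lane becomes 0x0000FF00
+*/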
__artificial__)) +_mm512_mask_rol_epi64(__m512i __W, __mmask8 __U, __m512i __A, const int __B) { + return (__m512i)__builtin_ia32_prolq512_mask((__v8di)__A, __B, (__v8di)__W, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_rol_epi64(__mmask8 __U, __m512i __A, const int __B) { + return (__m512i)__builtin_ia32_prolq512_mask( + (__v8di)__A, __B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_ror_epi64(__m512i __A, int __B) { + return (__m512i)__builtin_ia32_prorq512_mask( + (__v8di)__A, __B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_ror_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) { + return (__m512i)__builtin_ia32_prorq512_mask((__v8di)__A, __B, (__v8di)__W, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_ror_epi64(__mmask8 __U, __m512i __A, int __B) { + return (__m512i)__builtin_ia32_prorq512_mask( + (__v8di)__A, __B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +#else +#define _mm512_rol_epi32(A, B) \ + ((__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(A), (int)(B), \ + (__v16si)_mm512_undefined_epi32(), \ + (__mmask16)(-1))) +#define _mm512_mask_rol_epi32(W, U, A, B) \ + ((__m512i)__builtin_ia32_prold512_mask( \ + (__v16si)(__m512i)(A), (int)(B), (__v16si)(__m512i)(W), (__mmask16)(U))) +#define _mm512_maskz_rol_epi32(U, A, B) \ + ((__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(A), (int)(B), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(U))) +#define _mm512_ror_epi32(A, B) \ + ((__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \ + (__v16si)_mm512_undefined_epi32(), \ + (__mmask16)(-1))) +#define _mm512_mask_ror_epi32(W, U, A, B) \ + ((__m512i)__builtin_ia32_prord512_mask( \ + (__v16si)(__m512i)(A), (int)(B), (__v16si)(__m512i)(W), (__mmask16)(U))) +#define _mm512_maskz_ror_epi32(U, A, B) \ + ((__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(U))) +#define _mm512_rol_epi64(A, B) \ + ((__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(A), (int)(B), \ + (__v8di)_mm512_undefined_epi32(), \ + (__mmask8)(-1))) +#define _mm512_mask_rol_epi64(W, U, A, B) \ + ((__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(A), (int)(B), \ + (__v8di)(__m512i)(W), (__mmask8)(U))) +#define _mm512_maskz_rol_epi64(U, A, B) \ + ((__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(A), (int)(B), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U))) + +#define _mm512_ror_epi64(A, B) \ + ((__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \ + (__v8di)_mm512_undefined_epi32(), \ + (__mmask8)(-1))) +#define _mm512_mask_ror_epi64(W, U, A, B) \ + ((__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \ + (__v8di)(__m512i)(W), (__mmask8)(U))) +#define _mm512_maskz_ror_epi64(U, A, B) \ + ((__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U))) +#endif + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_and_si512(__m512i __A, __m512i __B) { + return (__m512i)((__v16su)__A & (__v16su)__B); +} + +extern __inline __m512i + 
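+/* Editor's note (not in upstream GCC): andnot computes (~__A) & __B,
+   i.e. it clears the bits of __B that are set in __A.  Note the maskz
+   and/andnot epi64 forms below spell their zero vector as
+   _mm512_setzero_pd() cast to __v8di; that is carried over verbatim from
+   GCC and is harmless, since GNU vector casts just reinterpret bits.
+   Sketch, given hypothetical vectors mask and bits:
+
+       __m512i kept = _mm512_andnot_si512(mask, bits);
+       // keeps only the bits of `bits' not set in `mask'
+*/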
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_and_epi32(__m512i __A, __m512i __B) { + return (__m512i)((__v16su)__A & (__v16su)__B); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_and_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pandd512_mask((__v16si)__A, (__v16si)__B, + (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_and_epi32(__mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pandd512_mask((__v16si)__A, (__v16si)__B, + (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_and_epi64(__m512i __A, __m512i __B) { + return (__m512i)((__v8du)__A & (__v8du)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_and_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pandq512_mask((__v8di)__A, (__v8di)__B, + (__v8di)__W, __U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_and_epi64(__mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pandq512_mask( + (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_pd(), __U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_andnot_si512(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pandnd512_mask( + (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), + (__mmask16)-1); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_andnot_epi32(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pandnd512_mask( + (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), + (__mmask16)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pandnd512_mask((__v16si)__A, (__v16si)__B, + (__v16si)__W, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pandnd512_mask((__v16si)__A, (__v16si)__B, + (__v16si)_mm512_setzero_si512(), + (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_andnot_epi64(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pandnq512_mask( + (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pandnq512_mask((__v8di)__A, (__v8di)__B, + (__v8di)__W, __U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pandnq512_mask( + (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_pd(), __U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, 
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_test_epi32_mask(__m512i __A, __m512i __B) {
+  return (__mmask16)__builtin_ia32_ptestmd512((__v16si)__A, (__v16si)__B,
+                                              (__mmask16)-1);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_test_epi32_mask(__mmask16 __U, __m512i __A, __m512i __B) {
+  return (__mmask16)__builtin_ia32_ptestmd512((__v16si)__A, (__v16si)__B, __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_test_epi64_mask(__m512i __A, __m512i __B) {
+  return (__mmask8)__builtin_ia32_ptestmq512((__v8di)__A, (__v8di)__B,
+                                             (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_test_epi64_mask(__mmask8 __U, __m512i __A, __m512i __B) {
+  return (__mmask8)__builtin_ia32_ptestmq512((__v8di)__A, (__v8di)__B, __U);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_testn_epi32_mask(__m512i __A, __m512i __B) {
+  return (__mmask16)__builtin_ia32_ptestnmd512((__v16si)__A, (__v16si)__B,
+                                               (__mmask16)-1);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_testn_epi32_mask(__mmask16 __U, __m512i __A, __m512i __B) {
+  return (__mmask16)__builtin_ia32_ptestnmd512((__v16si)__A, (__v16si)__B, __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_testn_epi64_mask(__m512i __A, __m512i __B) {
+  return (__mmask8)__builtin_ia32_ptestnmq512((__v8di)__A, (__v8di)__B,
+                                              (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_testn_epi64_mask(__mmask8 __U, __m512i __A, __m512i __B) {
+  return (__mmask8)__builtin_ia32_ptestnmq512((__v8di)__A, (__v8di)__B, __U);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_abs_ps(__m512 __A) {
+  return (__m512)_mm512_and_epi32((__m512i)__A, _mm512_set1_epi32(0x7fffffff));
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_abs_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  return (__m512)_mm512_mask_and_epi32((__m512i)__W, __U, (__m512i)__A,
+                                       _mm512_set1_epi32(0x7fffffff));
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_abs_pd(__m512d __A) {
+  return (__m512d)_mm512_and_epi64((__m512i)__A,
+                                   _mm512_set1_epi64(0x7fffffffffffffffLL));
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_abs_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  return (__m512d)_mm512_mask_and_epi64(
+      (__m512i)__W, __U, (__m512i)__A, _mm512_set1_epi64(0x7fffffffffffffffLL));
+}
+
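+/* The _mm512_abs_ps/_mm512_abs_pd wrappers above compute |x| by ANDing away
+   the IEEE-754 sign bit with the integer AND intrinsics defined earlier,
+   rather than by comparing and selecting. Scalar sketch of the same trick
+   (illustrative only; `x` is a float, memcpy from <string.h>):
+
+     uint32_t bits;
+     memcpy(&bits, &x, sizeof(bits));
+     bits &= 0x7fffffff;                  // clear sign bit -> |x|
+     memcpy(&x, &bits, sizeof(bits));
+*/
+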
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_unpackhi_epi32(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_punpckhdq512_mask(
+      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(),
+      (__mmask16)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A,
+                               __m512i __B) {
+  return (__m512i)__builtin_ia32_punpckhdq512_mask(
+      (__v16si)__A, (__v16si)__B, (__v16si)__W, (__mmask16)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_punpckhdq512_mask(
+      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(),
+      (__mmask16)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_unpackhi_epi64(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_punpckhqdq512_mask(
+      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A,
+                               __m512i __B) {
+  return (__m512i)__builtin_ia32_punpckhqdq512_mask((__v8di)__A, (__v8di)__B,
+                                                    (__v8di)__W, (__mmask8)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_punpckhqdq512_mask(
+      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_unpacklo_epi32(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_punpckldq512_mask(
+      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(),
+      (__mmask16)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A,
+                               __m512i __B) {
+  return (__m512i)__builtin_ia32_punpckldq512_mask(
+      (__v16si)__A, (__v16si)__B, (__v16si)__W, (__mmask16)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_punpckldq512_mask(
+      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(),
+      (__mmask16)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_unpacklo_epi64(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_punpcklqdq512_mask(
+      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_unpacklo_epi64(__m512i __W, __mmask8 __U, __m512i __A,
+                               __m512i __B) {
+  return (__m512i)__builtin_ia32_punpcklqdq512_mask((__v8di)__A, (__v8di)__B,
+                                                    (__v8di)__W, (__mmask8)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_unpacklo_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_punpcklqdq512_mask(
+      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+}
+
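+/* The scalar conversions below take an explicit rounding mode __R, e.g.
+   (sketch, assuming a __m128 value `f` defined elsewhere):
+
+     long long n =
+         _mm_cvt_roundss_si64(f, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+
+   __R must reach the builtin as a compile-time constant, which is why each
+   block has an #else branch of macro fallbacks: without optimization the
+   always-inline wrappers are not folded, so the immediate would be lost. */
+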
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline unsigned long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvt_roundss_u64(__m128 __A, const int __R) {
+  return (unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)__A, __R);
+}
+
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvt_roundss_si64(__m128 __A, const int __R) {
+  return (long long)__builtin_ia32_vcvtss2si64((__v4sf)__A, __R);
+}
+
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvt_roundss_i64(__m128 __A, const int __R) {
+  return (long long)__builtin_ia32_vcvtss2si64((__v4sf)__A, __R);
+}
+
+extern __inline unsigned long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtt_roundss_u64(__m128 __A, const int __R) {
+  return (unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)__A, __R);
+}
+
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtt_roundss_i64(__m128 __A, const int __R) {
+  return (long long)__builtin_ia32_vcvttss2si64((__v4sf)__A, __R);
+}
+
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtt_roundss_si64(__m128 __A, const int __R) {
+  return (long long)__builtin_ia32_vcvttss2si64((__v4sf)__A, __R);
+}
+#else
+#define _mm_cvt_roundss_u64(A, B) \
+  ((unsigned long long)__builtin_ia32_vcvtss2usi64(A, B))
+
+#define _mm_cvt_roundss_si64(A, B) ((long long)__builtin_ia32_vcvtss2si64(A, B))
+
+#define _mm_cvt_roundss_i64(A, B) ((long long)__builtin_ia32_vcvtss2si64(A, B))
+
+#define _mm_cvtt_roundss_u64(A, B) \
+  ((unsigned long long)__builtin_ia32_vcvttss2usi64(A, B))
+
+#define _mm_cvtt_roundss_i64(A, B) \
+  ((long long)__builtin_ia32_vcvttss2si64(A, B))
+
+#define _mm_cvtt_roundss_si64(A, B) \
+  ((long long)__builtin_ia32_vcvttss2si64(A, B))
+#endif
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvt_roundss_u32(__m128 __A, const int __R) {
+  return (unsigned)__builtin_ia32_vcvtss2usi32((__v4sf)__A, __R);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvt_roundss_si32(__m128 __A, const int __R) {
+  return (int)__builtin_ia32_vcvtss2si32((__v4sf)__A, __R);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvt_roundss_i32(__m128 __A, const int __R) {
+  return (int)__builtin_ia32_vcvtss2si32((__v4sf)__A, __R);
+}
+
+extern __inline unsigned
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtt_roundss_u32(__m128 __A, const int __R) {
+  return (unsigned)__builtin_ia32_vcvttss2usi32((__v4sf)__A, __R);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtt_roundss_i32(__m128 __A, const int __R) {
+  return (int)__builtin_ia32_vcvttss2si32((__v4sf)__A, __R);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtt_roundss_si32(__m128 __A, const int __R) {
+  return (int)__builtin_ia32_vcvttss2si32((__v4sf)__A, __R);
+}
+#else
+#define _mm_cvt_roundss_u32(A, B) ((unsigned)__builtin_ia32_vcvtss2usi32(A, B))
+
+#define _mm_cvt_roundss_si32(A, B) ((int)__builtin_ia32_vcvtss2si32(A, B))
+
+#define _mm_cvt_roundss_i32(A, B) ((int)__builtin_ia32_vcvtss2si32(A, B))
+
+#define _mm_cvtt_roundss_u32(A, B) \
+  ((unsigned)__builtin_ia32_vcvttss2usi32(A, B))
+
+#define _mm_cvtt_roundss_si32(A, B) ((int)__builtin_ia32_vcvttss2si32(A, B))
+
+#define _mm_cvtt_roundss_i32(A, B) ((int)__builtin_ia32_vcvttss2si32(A, B))
+#endif
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline unsigned long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvt_roundsd_u64(__m128d __A, const int __R) {
+  return (unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)__A, __R);
+}
+
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvt_roundsd_si64(__m128d __A, const int __R) {
+  return (long long)__builtin_ia32_vcvtsd2si64((__v2df)__A, __R);
+}
+
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvt_roundsd_i64(__m128d __A, const int __R) {
+  return (long long)__builtin_ia32_vcvtsd2si64((__v2df)__A, __R);
+}
+
+extern __inline unsigned long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtt_roundsd_u64(__m128d __A, const int __R) {
+  return (unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)__A, __R);
+}
+
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtt_roundsd_si64(__m128d __A, const int __R) {
+  return (long long)__builtin_ia32_vcvttsd2si64((__v2df)__A, __R);
+}
+
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtt_roundsd_i64(__m128d __A, const int __R) {
+  return (long long)__builtin_ia32_vcvttsd2si64((__v2df)__A, __R);
+}
+#else
+#define _mm_cvt_roundsd_u64(A, B) \
+  ((unsigned long long)__builtin_ia32_vcvtsd2usi64(A, B))
+
+#define _mm_cvt_roundsd_si64(A, B) ((long long)__builtin_ia32_vcvtsd2si64(A, B))
+
+#define _mm_cvt_roundsd_i64(A, B) ((long long)__builtin_ia32_vcvtsd2si64(A, B))
+
+#define _mm_cvtt_roundsd_u64(A, B) \
+  ((unsigned long long)__builtin_ia32_vcvttsd2usi64(A, B))
+
+#define _mm_cvtt_roundsd_si64(A, B) \
+  ((long long)__builtin_ia32_vcvttsd2si64(A, B))
+
+#define _mm_cvtt_roundsd_i64(A, B) \
+  ((long long)__builtin_ia32_vcvttsd2si64(A, B))
+#endif
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvt_roundsd_u32(__m128d __A, const int __R) {
+  return (unsigned)__builtin_ia32_vcvtsd2usi32((__v2df)__A, __R);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvt_roundsd_si32(__m128d __A, const int __R) {
+  return (int)__builtin_ia32_vcvtsd2si32((__v2df)__A, __R);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvt_roundsd_i32(__m128d __A, const int __R) {
+  return (int)__builtin_ia32_vcvtsd2si32((__v2df)__A, __R);
+}
+
+extern __inline unsigned
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtt_roundsd_u32(__m128d __A, const int __R) {
+  return (unsigned)__builtin_ia32_vcvttsd2usi32((__v2df)__A, __R);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtt_roundsd_i32(__m128d __A, const int __R) {
+  return (int)__builtin_ia32_vcvttsd2si32((__v2df)__A, __R);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtt_roundsd_si32(__m128d __A, const int __R) {
+  return (int)__builtin_ia32_vcvttsd2si32((__v2df)__A, __R);
+}
+#else
+#define _mm_cvt_roundsd_u32(A, B) ((unsigned)__builtin_ia32_vcvtsd2usi32(A, B))
+
+#define _mm_cvt_roundsd_si32(A, B) ((int)__builtin_ia32_vcvtsd2si32(A, B))
+
+#define _mm_cvt_roundsd_i32(A, B) ((int)__builtin_ia32_vcvtsd2si32(A, B))
+
+#define _mm_cvtt_roundsd_u32(A, B) \
+  ((unsigned)__builtin_ia32_vcvttsd2usi32(A, B))
+
+#define _mm_cvtt_roundsd_si32(A, B) ((int)__builtin_ia32_vcvttsd2si32(A, B))
+
+#define _mm_cvtt_roundsd_i32(A, B) ((int)__builtin_ia32_vcvttsd2si32(A, B))
+#endif
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_movedup_pd(__m512d __A) {
+  return (__m512d)__builtin_ia32_movddup512_mask(
+      (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_movedup_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  return (__m512d)__builtin_ia32_movddup512_mask((__v8df)__A, (__v8df)__W,
+                                                 (__mmask8)__U);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_movedup_pd(__mmask8 __U, __m512d __A) {
+  return (__m512d)__builtin_ia32_movddup512_mask(
+      (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_unpacklo_pd(__m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_unpcklpd512_mask(
+      (__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A,
+                            __m512d __B) {
+  return (__m512d)__builtin_ia32_unpcklpd512_mask((__v8df)__A, (__v8df)__B,
+                                                  (__v8df)__W, (__mmask8)__U);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_unpacklo_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_unpcklpd512_mask(
+      (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_unpackhi_pd(__m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_unpckhpd512_mask(
+      (__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A,
+                            __m512d __B) {
+  return (__m512d)__builtin_ia32_unpckhpd512_mask((__v8df)__A, (__v8df)__B,
+                                                  (__v8df)__W, (__mmask8)__U);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  return (__m512d)__builtin_ia32_unpckhpd512_mask(
+      (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_unpackhi_ps(__m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_unpckhps512_mask((__v16sf)__A, (__v16sf)__B,
+                                                 (__v16sf)_mm512_undefined_ps(),
+                                                 (__mmask16)-1);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_unpckhps512_mask((__v16sf)__A, (__v16sf)__B,
+                                                 (__v16sf)__W, (__mmask16)__U);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_unpackhi_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_unpckhps512_mask(
+      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
+}
+
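+/* The next block converts between float, double, and IEEE half precision
+   with an explicit rounding mode. Sixteen halves travel in a __m256i since
+   no dedicated f16 vector type is used here. Sketch (assuming `v` is a
+   __m512 of 16 floats):
+
+     __m256i h = _mm512_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT |
+                                        _MM_FROUND_NO_EXC);  // f32 -> f16
+     __m512 b = _mm512_cvt_roundph_ps(h, _MM_FROUND_CUR_DIRECTION);
+*/
+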
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cvt_roundps_pd(__m256 __A, const int __R) {
+  return (__m512d)__builtin_ia32_cvtps2pd512_mask(
+      (__v8sf)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cvt_roundps_pd(__m512d __W, __mmask8 __U, __m256 __A,
+                               const int __R) {
+  return (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)__A, (__v8df)__W,
+                                                  (__mmask8)__U, __R);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_cvt_roundps_pd(__mmask8 __U, __m256 __A, const int __R) {
+  return (__m512d)__builtin_ia32_cvtps2pd512_mask(
+      (__v8sf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cvt_roundph_ps(__m256i __A, const int __R) {
+  return (__m512)__builtin_ia32_vcvtph2ps512_mask(
+      (__v16hi)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cvt_roundph_ps(__m512 __W, __mmask16 __U, __m256i __A,
+                               const int __R) {
+  return (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)__A, (__v16sf)__W,
+                                                  (__mmask16)__U, __R);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_cvt_roundph_ps(__mmask16 __U, __m256i __A, const int __R) {
+  return (__m512)__builtin_ia32_vcvtph2ps512_mask(
+      (__v16hi)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cvt_roundps_ph(__m512 __A, const int __I) {
+  return (__m256i)__builtin_ia32_vcvtps2ph512_mask(
+      (__v16sf)__A, __I, (__v16hi)_mm256_undefined_si256(), -1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cvtps_ph(__m512 __A, const int __I) {
+  return (__m256i)__builtin_ia32_vcvtps2ph512_mask(
+      (__v16sf)__A, __I, (__v16hi)_mm256_undefined_si256(), -1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cvt_roundps_ph(__m256i __U, __mmask16 __W, __m512 __A,
+                               const int __I) {
+  return (__m256i)__builtin_ia32_vcvtps2ph512_mask(
+      (__v16sf)__A, __I, (__v16hi)__U, (__mmask16)__W);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cvtps_ph(__m256i __U, __mmask16 __W, __m512 __A, const int __I) {
+  return (__m256i)__builtin_ia32_vcvtps2ph512_mask(
+      (__v16sf)__A, __I, (__v16hi)__U, (__mmask16)__W);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_cvt_roundps_ph(__mmask16 __W, __m512 __A, const int __I) {
+  return (__m256i)__builtin_ia32_vcvtps2ph512_mask(
+      (__v16sf)__A, __I, (__v16hi)_mm256_setzero_si256(), (__mmask16)__W);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_cvtps_ph(__mmask16 __W, __m512 __A, const int __I) {
+  return (__m256i)__builtin_ia32_vcvtps2ph512_mask(
+      (__v16sf)__A, __I, (__v16hi)_mm256_setzero_si256(), (__mmask16)__W);
+}
+
+#else
+#define _mm512_cvt_roundps_pd(A, B) \
+  (__m512d) \
+      __builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, B)
+
+#define _mm512_mask_cvt_roundps_pd(W, U, A, B) \
+  (__m512d) __builtin_ia32_cvtps2pd512_mask(A, (__v8df)(W), U, B)
+
+#define _mm512_maskz_cvt_roundps_pd(U, A, B) \
+  (__m512d) \
+      __builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_setzero_pd(), U, B)
+
+#define _mm512_cvt_roundph_ps(A, B) \
+  (__m512) __builtin_ia32_vcvtph2ps512_mask( \
+      (__v16hi)(A), (__v16sf)_mm512_undefined_ps(), -1, B)
+
+#define _mm512_mask_cvt_roundph_ps(W, U, A, B) \
+  (__m512) __builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)(W), U, B)
+
+#define _mm512_maskz_cvt_roundph_ps(U, A, B) \
+  (__m512) __builtin_ia32_vcvtph2ps512_mask( \
+      (__v16hi)(A), (__v16sf)_mm512_setzero_ps(), U, B)
+
+#define _mm512_cvt_roundps_ph(A, I) \
+  ((__m256i)__builtin_ia32_vcvtps2ph512_mask( \
+      (__v16sf)(__m512)A, (int)(I), (__v16hi)_mm256_undefined_si256(), -1))
+#define _mm512_cvtps_ph(A, I) \
+  ((__m256i)__builtin_ia32_vcvtps2ph512_mask( \
+      (__v16sf)(__m512)A, (int)(I), (__v16hi)_mm256_undefined_si256(), -1))
+#define _mm512_mask_cvt_roundps_ph(U, W, A, I) \
+  ((__m256i)__builtin_ia32_vcvtps2ph512_mask( \
+      (__v16sf)(__m512)A, (int)(I), (__v16hi)(__m256i)(U), (__mmask16)(W)))
+#define _mm512_mask_cvtps_ph(U, W, A, I) \
+  ((__m256i)__builtin_ia32_vcvtps2ph512_mask( \
+      (__v16sf)(__m512)A, (int)(I), (__v16hi)(__m256i)(U), (__mmask16)(W)))
+#define _mm512_maskz_cvt_roundps_ph(W, A, I) \
+  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)A, (int)(I), \
+                                             (__v16hi)_mm256_setzero_si256(), \
+                                             (__mmask16)(W)))
+#define _mm512_maskz_cvtps_ph(W, A, I) \
+  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)A, (int)(I), \
+                                             (__v16hi)_mm256_setzero_si256(), \
+                                             (__mmask16)(W)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cvt_roundpd_ps(__m512d __A, const int __R) {
+  return (__m256)__builtin_ia32_cvtpd2ps512_mask(
+      (__v8df)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, __R);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cvt_roundpd_ps(__m256 __W, __mmask8 __U, __m512d __A,
+                               const int __R) {
+  return (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)__A, (__v8sf)__W,
+                                                 (__mmask8)__U, __R);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_cvt_roundpd_ps(__mmask8 __U, __m512d __A, const int __R) {
+  return (__m256)__builtin_ia32_cvtpd2ps512_mask(
+      (__v8df)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvt_roundsd_ss(__m128 __A, __m128d __B, const int __R) {
+  return (__m128)__builtin_ia32_cvtsd2ss_round((__v4sf)__A, (__v2df)__B, __R);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvt_roundss_sd(__m128d __A, __m128 __B, const int __R) {
+  return (__m128d)__builtin_ia32_cvtss2sd_round((__v2df)__A, (__v4sf)__B, __R);
+}
+#else
+#define _mm512_cvt_roundpd_ps(A, B) \
+  (__m256) \
+      __builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_undefined_ps(), -1, B)
+
+#define _mm512_mask_cvt_roundpd_ps(W, U, A, B) \
+  (__m256) __builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)(W), U, B)
+
+#define _mm512_maskz_cvt_roundpd_ps(U, A, B) \
+  (__m256) __builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_setzero_ps(), U, B)
+
+#define _mm_cvt_roundsd_ss(A, B, C) \
+  (__m128) __builtin_ia32_cvtsd2ss_round(A, B, C)
+
+#define _mm_cvt_roundss_sd(A, B, C) \
+  (__m128d) __builtin_ia32_cvtss2sd_round(A, B, C)
+#endif
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_stream_si512(__m512i *__P, __m512i __A) {
+  __builtin_ia32_movntdq512((__v8di *)__P, (__v8di)__A);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_stream_ps(float *__P, __m512 __A) {
+  __builtin_ia32_movntps512(__P, (__v16sf)__A);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_stream_pd(double *__P, __m512d __A) {
+  __builtin_ia32_movntpd512(__P, (__v8df)__A);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_stream_load_si512(void *__P) {
+  return __builtin_ia32_movntdqa512((__v8di *)__P);
+}
+
+typedef enum {
+  _MM_MANT_NORM_1_2,
+  _MM_MANT_NORM_p5_2,
+  _MM_MANT_NORM_p5_1,
+  _MM_MANT_NORM_p75_1p5
+} _MM_MANTISSA_NORM_ENUM;
+
+typedef enum {
+  _MM_MANT_SIGN_src,
+  _MM_MANT_SIGN_zero,
+  _MM_MANT_SIGN_nan
+} _MM_MANTISSA_SIGN_ENUM;
+
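+/* For the getmant intrinsics below, _MM_MANTISSA_NORM_* picks the interval
+   the extracted mantissa is normalized into (_1_2 -> [1,2), _p5_2 ->
+   [0.5,2), _p5_1 -> [0.5,1), _p75_1p5 -> [0.75,1.5)), and
+   _MM_MANTISSA_SIGN_* chooses whether the result keeps the source sign,
+   forces it positive, or yields NaN for negative inputs. The two are packed
+   as (sign << 2) | norm into the instruction immediate. Sketch (assuming a
+   __m512d value `x`):
+
+     __m512d m = _mm512_getmant_round_pd(x, _MM_MANT_NORM_1_2,
+                                         _MM_MANT_SIGN_zero,
+                                         _MM_FROUND_CUR_DIRECTION);
+*/
+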
+#ifdef __OPTIMIZE__
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_getexp_round_ss(__m128 __A, __m128 __B, const int __R) {
+  return (__m128)__builtin_ia32_getexpss128_round((__v4sf)__A, (__v4sf)__B,
+                                                  __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_getexp_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+                             const int __R) {
+  return (__m128)__builtin_ia32_getexpss_mask_round(
+      (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_getexp_round_ss(__mmask8 __U, __m128 __A, __m128 __B,
+                              const int __R) {
+  return (__m128)__builtin_ia32_getexpss_mask_round(
+      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_getexp_round_sd(__m128d __A, __m128d __B, const int __R) {
+  return (__m128d)__builtin_ia32_getexpsd128_round((__v2df)__A, (__v2df)__B,
+                                                   __R);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_getexp_round_sd(__m128d __W, __mmask8 __U, __m128d __A,
+                             __m128d __B, const int __R) {
+  return (__m128d)__builtin_ia32_getexpsd_mask_round(
+      (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_getexp_round_sd(__mmask8 __U, __m128d __A, __m128d __B,
+                              const int __R) {
+  return (__m128d)__builtin_ia32_getexpsd_mask_round(
+      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_getexp_round_ps(__m512 __A, const int __R) {
+  return (__m512)__builtin_ia32_getexpps512_mask(
+      (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_getexp_round_ps(__m512 __W, __mmask16 __U, __m512 __A,
+                                const int __R) {
+  return (__m512)__builtin_ia32_getexpps512_mask((__v16sf)__A, (__v16sf)__W,
+                                                 (__mmask16)__U, __R);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_getexp_round_ps(__mmask16 __U, __m512 __A, const int __R) {
+  return (__m512)__builtin_ia32_getexpps512_mask(
+      (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_getexp_round_pd(__m512d __A, const int __R) {
+  return (__m512d)__builtin_ia32_getexppd512_mask(
+      (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_getexp_round_pd(__m512d __W, __mmask8 __U, __m512d __A,
+                                const int __R) {
+  return (__m512d)__builtin_ia32_getexppd512_mask((__v8df)__A, (__v8df)__W,
+                                                  (__mmask8)__U, __R);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_getexp_round_pd(__mmask8 __U, __m512d __A, const int __R) {
+  return (__m512d)__builtin_ia32_getexppd512_mask(
+      (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_getmant_round_pd(__m512d __A, _MM_MANTISSA_NORM_ENUM __B,
+                            _MM_MANTISSA_SIGN_ENUM __C, const int __R) {
+  return (__m512d)__builtin_ia32_getmantpd512_mask(
+      (__v8df)__A, (__C << 2) | __B, _mm512_undefined_pd(), (__mmask8)-1, __R);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_getmant_round_pd(__m512d __W, __mmask8 __U, __m512d __A,
+                                 _MM_MANTISSA_NORM_ENUM __B,
+                                 _MM_MANTISSA_SIGN_ENUM __C, const int __R) {
+  return (__m512d)__builtin_ia32_getmantpd512_mask(
+      (__v8df)__A, (__C << 2) | __B, (__v8df)__W, __U, __R);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_getmant_round_pd(__mmask8 __U, __m512d __A,
+                                  _MM_MANTISSA_NORM_ENUM __B,
+                                  _MM_MANTISSA_SIGN_ENUM __C, const int __R) {
+  return (__m512d)__builtin_ia32_getmantpd512_mask(
+      (__v8df)__A, (__C << 2) | __B, (__v8df)_mm512_setzero_pd(), __U, __R);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_getmant_round_ps(__m512 __A, _MM_MANTISSA_NORM_ENUM __B,
+                            _MM_MANTISSA_SIGN_ENUM __C, const int __R) {
+  return (__m512)__builtin_ia32_getmantps512_mask(
+      (__v16sf)__A, (__C << 2) | __B, _mm512_undefined_ps(), (__mmask16)-1,
+      __R);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_getmant_round_ps(__m512 __W, __mmask16 __U, __m512 __A,
+                                 _MM_MANTISSA_NORM_ENUM __B,
+                                 _MM_MANTISSA_SIGN_ENUM __C, const int __R) {
+  return (__m512)__builtin_ia32_getmantps512_mask(
+      (__v16sf)__A, (__C << 2) | __B, (__v16sf)__W, __U, __R);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_getmant_round_ps(__mmask16 __U, __m512 __A,
+                                  _MM_MANTISSA_NORM_ENUM __B,
+                                  _MM_MANTISSA_SIGN_ENUM __C, const int __R) {
+  return (__m512)__builtin_ia32_getmantps512_mask(
+      (__v16sf)__A, (__C << 2) | __B, (__v16sf)_mm512_setzero_ps(), __U, __R);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_getmant_round_sd(__m128d __A, __m128d __B, _MM_MANTISSA_NORM_ENUM __C,
+                         _MM_MANTISSA_SIGN_ENUM __D, const int __R) {
+  return (__m128d)__builtin_ia32_getmantsd_round((__v2df)__A, (__v2df)__B,
+                                                 (__D << 2) | __C, __R);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_getmant_round_sd(__m128d __W, __mmask8 __U, __m128d __A,
+                              __m128d __B, _MM_MANTISSA_NORM_ENUM __C,
+                              _MM_MANTISSA_SIGN_ENUM __D, const int __R) {
+  return (__m128d)__builtin_ia32_getmantsd_mask_round(
+      (__v2df)__A, (__v2df)__B, (__D << 2) | __C, (__v2df)__W, __U, __R);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_getmant_round_sd(__mmask8 __U, __m128d __A, __m128d __B,
+                               _MM_MANTISSA_NORM_ENUM __C,
+                               _MM_MANTISSA_SIGN_ENUM __D, const int __R) {
+  return (__m128d)__builtin_ia32_getmantsd_mask_round(
+      (__v2df)__A, (__v2df)__B, (__D << 2) | __C, (__v2df)_mm_setzero_pd(), __U,
+      __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_getmant_round_ss(__m128 __A, __m128 __B, _MM_MANTISSA_NORM_ENUM __C,
+                         _MM_MANTISSA_SIGN_ENUM __D, const int __R) {
+  return (__m128)__builtin_ia32_getmantss_round((__v4sf)__A, (__v4sf)__B,
+                                                (__D << 2) | __C, __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_getmant_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+                              _MM_MANTISSA_NORM_ENUM __C,
+                              _MM_MANTISSA_SIGN_ENUM __D, const int __R) {
+  return (__m128)__builtin_ia32_getmantss_mask_round(
+      (__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, (__v4sf)__W, __U, __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_getmant_round_ss(__mmask8 __U, __m128 __A, __m128 __B,
+                               _MM_MANTISSA_NORM_ENUM __C,
+                               _MM_MANTISSA_SIGN_ENUM __D, const int __R) {
+  return (__m128)__builtin_ia32_getmantss_mask_round(
+      (__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, (__v4sf)_mm_setzero_ps(), __U,
+      __R);
+}
+
+#else
+#define _mm512_getmant_round_pd(X, B, C, R) \
+  ((__m512d)__builtin_ia32_getmantpd512_mask( \
+      (__v8df)(__m512d)(X), (int)(((C) << 2) | (B)), \
+      (__v8df)(__m512d)_mm512_undefined_pd(), (__mmask8)-1, (R)))
+
+#define _mm512_mask_getmant_round_pd(W, U, X, B, C, R) \
+  ((__m512d)__builtin_ia32_getmantpd512_mask( \
+      (__v8df)(__m512d)(X), (int)(((C) << 2) | (B)), (__v8df)(__m512d)(W), \
+      (__mmask8)(U), (R)))
+
+#define _mm512_maskz_getmant_round_pd(U, X, B, C, R) \
+  ((__m512d)__builtin_ia32_getmantpd512_mask( \
+      (__v8df)(__m512d)(X), (int)(((C) << 2) | (B)), \
+      (__v8df)(__m512d)_mm512_setzero_pd(), (__mmask8)(U), (R)))
+#define _mm512_getmant_round_ps(X, B, C, R) \
+  ((__m512)__builtin_ia32_getmantps512_mask( \
+      (__v16sf)(__m512)(X), (int)(((C) << 2) | (B)), \
+      (__v16sf)(__m512)_mm512_undefined_ps(), (__mmask16)-1, (R)))
+
+#define _mm512_mask_getmant_round_ps(W, U, X, B, C, R) \
+  ((__m512)__builtin_ia32_getmantps512_mask( \
+      (__v16sf)(__m512)(X), (int)(((C) << 2) | (B)), (__v16sf)(__m512)(W), \
+      (__mmask16)(U), (R)))
+
+#define _mm512_maskz_getmant_round_ps(U, X, B, C, R) \
+  ((__m512)__builtin_ia32_getmantps512_mask( \
+      (__v16sf)(__m512)(X), (int)(((C) << 2) | (B)), \
+      (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U), (R)))
+#define _mm_getmant_round_sd(X, Y, C, D, R) \
+  ((__m128d)__builtin_ia32_getmantsd_round((__v2df)(__m128d)(X), \
+                                           (__v2df)(__m128d)(Y), \
+                                           (int)(((D) << 2) | (C)), (R)))
+
+#define _mm_mask_getmant_round_sd(W, U, X, Y, C, D, R) \
+  ((__m128d)__builtin_ia32_getmantsd_mask_round( \
+      (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D) << 2) | (C)), \
+      (__v2df)(__m128d)(W), (__mmask8)(U), (R)))
+
+#define _mm_maskz_getmant_round_sd(U, X, Y, C, D, R) \
+  ((__m128d)__builtin_ia32_getmantsd_mask_round( \
+      (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D) << 2) | (C)), \
+      (__v2df)(__m128d)_mm_setzero_pd(), (__mmask8)(U), (R)))
+
+#define _mm_getmant_round_ss(X, Y, C, D, R) \
+  ((__m128)__builtin_ia32_getmantss_round( \
+      (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D) << 2) | (C)), (R)))
+
+#define _mm_mask_getmant_round_ss(W, U, X, Y, C, D, R) \
+  ((__m128)__builtin_ia32_getmantss_mask_round( \
+      (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D) << 2) | (C)), \
+      (__v4sf)(__m128)(W), (__mmask8)(U), (R)))
+
+#define _mm_maskz_getmant_round_ss(U, X, Y, C, D, R) \
+  ((__m128)__builtin_ia32_getmantss_mask_round( \
+      (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D) << 2) | (C)), \
+      (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)(U), (R)))
+
+#define _mm_getexp_round_ss(A, B, R) \
+  ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), R))
+
+#define _mm_mask_getexp_round_ss(W, U, A, B, C) \
+  (__m128) __builtin_ia32_getexpss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_getexp_round_ss(U, A, B, C) \
+  (__m128) \
+      __builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#define _mm_getexp_round_sd(A, B, R) \
+  ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), R))
+
+#define _mm_mask_getexp_round_sd(W, U, A, B, C) \
+  (__m128d) __builtin_ia32_getexpsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_getexp_round_sd(U, A, B, C) \
+  (__m128d) \
+      __builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm512_getexp_round_ps(A, R) \
+  ((__m512)__builtin_ia32_getexpps512_mask( \
+      (__v16sf)(__m512)(A), (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, R))
+
+#define _mm512_mask_getexp_round_ps(W, U, A, R) \
+  ((__m512)__builtin_ia32_getexpps512_mask( \
+      (__v16sf)(__m512)(A), (__v16sf)(__m512)(W), (__mmask16)(U), R))
+
+#define _mm512_maskz_getexp_round_ps(U, A, R) \
+  ((__m512)__builtin_ia32_getexpps512_mask( \
+      (__v16sf)(__m512)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), R))
+
+#define _mm512_getexp_round_pd(A, R) \
+  ((__m512d)__builtin_ia32_getexppd512_mask( \
+      (__v8df)(__m512d)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)-1, R))
+
+#define _mm512_mask_getexp_round_pd(W, U, A, R) \
+  ((__m512d)__builtin_ia32_getexppd512_mask( \
+      (__v8df)(__m512d)(A), (__v8df)(__m512d)(W), (__mmask8)(U), R))
+
+#define _mm512_maskz_getexp_round_pd(U, A, R) \
+  ((__m512d)__builtin_ia32_getexppd512_mask( \
+      (__v8df)(__m512d)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), R))
+#endif
+
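+/* vrndscale, wrapped below, rounds to a fixed number of binary fraction
+   bits: bits 7:4 of the immediate give a scale M (round to a multiple of
+   2**-M) and the low bits select the rounding mode, so M = 0 degenerates to
+   plain floor/ceil/trunc. Sketch (assuming a __m512 value `x`):
+
+     __m512 f = _mm512_roundscale_round_ps(x, _MM_FROUND_FLOOR,
+                                           _MM_FROUND_CUR_DIRECTION);
+
+   which is exactly how _mm512_floor_ps further down is built. */
+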
+#ifdef __OPTIMIZE__
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_roundscale_round_ps(__m512 __A, const int __imm, const int __R) {
+  return (__m512)__builtin_ia32_rndscaleps_mask(
+      (__v16sf)__A, __imm, (__v16sf)_mm512_undefined_ps(), -1, __R);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_roundscale_round_ps(__m512 __A, __mmask16 __B, __m512 __C,
+                                    const int __imm, const int __R) {
+  return (__m512)__builtin_ia32_rndscaleps_mask(
+      (__v16sf)__C, __imm, (__v16sf)__A, (__mmask16)__B, __R);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_roundscale_round_ps(__mmask16 __A, __m512 __B, const int __imm,
+                                     const int __R) {
+  return (__m512)__builtin_ia32_rndscaleps_mask(
+      (__v16sf)__B, __imm, (__v16sf)_mm512_setzero_ps(), (__mmask16)__A, __R);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_roundscale_round_pd(__m512d __A, const int __imm, const int __R) {
+  return (__m512d)__builtin_ia32_rndscalepd_mask(
+      (__v8df)__A, __imm, (__v8df)_mm512_undefined_pd(), -1, __R);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_roundscale_round_pd(__m512d __A, __mmask8 __B, __m512d __C,
+                                    const int __imm, const int __R) {
+  return (__m512d)__builtin_ia32_rndscalepd_mask(
+      (__v8df)__C, __imm, (__v8df)__A, (__mmask8)__B, __R);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_roundscale_round_pd(__mmask8 __A, __m512d __B, const int __imm,
+                                     const int __R) {
+  return (__m512d)__builtin_ia32_rndscalepd_mask(
+      (__v8df)__B, __imm, (__v8df)_mm512_setzero_pd(), (__mmask8)__A, __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_roundscale_round_ss(__m128 __A, __m128 __B, const int __imm,
+                            const int __R) {
+  return (__m128)__builtin_ia32_rndscaless_round((__v4sf)__A, (__v4sf)__B,
+                                                 __imm, __R);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_roundscale_round_sd(__m128d __A, __m128d __B, const int __imm,
+                            const int __R) {
+  return (__m128d)__builtin_ia32_rndscalesd_round((__v2df)__A, (__v2df)__B,
+                                                  __imm, __R);
+}
+
+#else
+#define _mm512_roundscale_round_ps(A, B, R) \
+  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
+                                          (__v16sf)_mm512_undefined_ps(), \
+                                          (__mmask16)(-1), R))
+#define _mm512_mask_roundscale_round_ps(A, B, C, D, R) \
+  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(D), \
+                                          (__v16sf)(__m512)(A), \
+                                          (__mmask16)(B), R))
+#define _mm512_maskz_roundscale_round_ps(A, B, C, R) \
+  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(C), \
+                                          (__v16sf)_mm512_setzero_ps(), \
+                                          (__mmask16)(A), R))
+#define _mm512_roundscale_round_pd(A, B, R) \
+  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \
+                                           (__v8df)_mm512_undefined_pd(), \
+                                           (__mmask8)(-1), R))
+#define _mm512_mask_roundscale_round_pd(A, B, C, D, R) \
+  ((__m512d)__builtin_ia32_rndscalepd_mask( \
+      (__v8df)(__m512d)(C), (int)(D), (__v8df)(__m512d)(A), (__mmask8)(B), R))
+#define _mm512_maskz_roundscale_round_pd(A, B, C, R) \
+  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(C), \
+                                           (__v8df)_mm512_setzero_pd(), \
+                                           (__mmask8)(A), R))
+#define _mm_roundscale_round_ss(A, B, C, R) \
+  ((__m128)__builtin_ia32_rndscaless_round((__v4sf)(__m128)(A), \
+                                           (__v4sf)(__m128)(B), (int)(C), R))
+#define _mm_roundscale_round_sd(A, B, C, R) \
+  ((__m128d)__builtin_ia32_rndscalesd_round( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), R))
+#endif
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_floor_ps(__m512 __A) {
+  return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, _MM_FROUND_FLOOR,
+                                                (__v16sf)__A, -1,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_floor_pd(__m512d __A) {
+  return (__m512d)__builtin_ia32_rndscalepd_mask(
+      (__v8df)__A, _MM_FROUND_FLOOR, (__v8df)__A, -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_ceil_ps(__m512 __A) {
+  return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, _MM_FROUND_CEIL,
+                                                (__v16sf)__A, -1,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_ceil_pd(__m512d __A) {
+  return (__m512d)__builtin_ia32_rndscalepd_mask(
+      (__v8df)__A, _MM_FROUND_CEIL, (__v8df)__A, -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_floor_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, _MM_FROUND_FLOOR,
+                                                (__v16sf)__W, __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_floor_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  return (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)__A, _MM_FROUND_FLOOR,
+                                                 (__v8df)__W, __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_ceil_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, _MM_FROUND_CEIL,
+                                                (__v16sf)__W, __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_ceil_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  return (__m512d)__builtin_ia32_rndscalepd_mask(
+      (__v8df)__A, _MM_FROUND_CEIL, (__v8df)__W, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_alignr_epi32(__m512i __A, __m512i __B, const int __imm) {
+  return (__m512i)__builtin_ia32_alignd512_mask(
+      (__v16si)__A, (__v16si)__B, __imm, (__v16si)_mm512_undefined_epi32(),
+      (__mmask16)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_alignr_epi32(__m512i __W, __mmask16 __U, __m512i __A,
+                             __m512i __B, const int __imm) {
+  return (__m512i)__builtin_ia32_alignd512_mask(
+      (__v16si)__A, (__v16si)__B, __imm, (__v16si)__W, (__mmask16)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_alignr_epi32(__mmask16 __U, __m512i __A, __m512i __B,
+                              const int __imm) {
+  return (__m512i)__builtin_ia32_alignd512_mask(
+      (__v16si)__A, (__v16si)__B, __imm, (__v16si)_mm512_setzero_si512(),
+      (__mmask16)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_alignr_epi64(__m512i __A, __m512i __B, const int __imm) {
+  return (__m512i)__builtin_ia32_alignq512_mask(
+      (__v8di)__A, (__v8di)__B, __imm, (__v8di)_mm512_undefined_epi32(),
+      (__mmask8)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_alignr_epi64(__m512i __W, __mmask8 __U, __m512i __A,
+                             __m512i __B, const int __imm) {
+  return (__m512i)__builtin_ia32_alignq512_mask((__v8di)__A, (__v8di)__B, __imm,
+                                                (__v8di)__W, (__mmask8)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_alignr_epi64(__mmask8 __U, __m512i __A, __m512i __B,
+                              const int __imm) {
+  return (__m512i)__builtin_ia32_alignq512_mask((__v8di)__A, (__v8di)__B, __imm,
+                                                (__v8di)_mm512_setzero_si512(),
+                                                (__mmask8)__U);
+}
+#else
+#define _mm512_alignr_epi32(X, Y, C) \
+  ((__m512i)__builtin_ia32_alignd512_mask( \
+      (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), \
+      (__v16si)_mm512_undefined_epi32(), (__mmask16)-1))
+
+#define _mm512_mask_alignr_epi32(W, U, X, Y, C) \
+  ((__m512i)__builtin_ia32_alignd512_mask( \
+      (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), \
+      (__v16si)(__m512i)(W), (__mmask16)(U)))
+
+#define _mm512_maskz_alignr_epi32(U, X, Y, C) \
+  ((__m512i)__builtin_ia32_alignd512_mask( \
+      (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(C), \
+      (__v16si)_mm512_setzero_si512(), (__mmask16)(U)))
+
+#define _mm512_alignr_epi64(X, Y, C) \
+  ((__m512i)__builtin_ia32_alignq512_mask( \
+      (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), \
+      (__v8di)_mm512_undefined_epi32(), (__mmask8)-1))
+
+#define _mm512_mask_alignr_epi64(W, U, X, Y, C) \
+  ((__m512i)__builtin_ia32_alignq512_mask( \
+      (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), \
+      (__v8di)(__m512i)(W), (__mmask8)(U)))
+
+#define _mm512_maskz_alignr_epi64(U, X, Y, C) \
+  ((__m512i)__builtin_ia32_alignq512_mask( \
+      (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(C), \
+      (__v8di)_mm512_setzero_si512(), (__mmask8)(U)))
+#endif
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmpeq_epi32_mask(__m512i __A, __m512i __B) {
+  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__A, (__v16si)__B,
+                                                   (__mmask16)-1);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmpeq_epi32_mask(__mmask16 __U, __m512i __A, __m512i __B) {
+  return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__A, (__v16si)__B,
+                                                   __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmpeq_epi64_mask(__mmask8 __U, __m512i __A, __m512i __B) {
+  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__A, (__v8di)__B,
+                                                  __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmpeq_epi64_mask(__m512i __A, __m512i __B) {
+  return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__A, (__v8di)__B,
+                                                  (__mmask8)-1);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmpgt_epi32_mask(__m512i __A, __m512i __B) {
+  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__A, (__v16si)__B,
+                                                   (__mmask16)-1);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmpgt_epi32_mask(__mmask16 __U, __m512i __A, __m512i __B) {
+  return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__A, (__v16si)__B,
+                                                   __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmpgt_epi64_mask(__mmask8 __U, __m512i __A, __m512i __B) {
+  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__A, (__v8di)__B,
+                                                  __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmpgt_epi64_mask(__m512i __A, __m512i __B) {
+  return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__A, (__v8di)__B,
+                                                  (__mmask8)-1);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmpge_epi32_mask(__m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 5,
+                                                (__mmask16)-1);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmpge_epi32_mask(__mmask16 __M, __m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 5,
+                                                (__mmask16)__M);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmpge_epu32_mask(__mmask16 __M, __m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 5,
+                                                 (__mmask16)__M);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmpge_epu32_mask(__m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 5,
+                                                 (__mmask16)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmpge_epi64_mask(__mmask8 __M, __m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 5,
+                                               (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmpge_epi64_mask(__m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 5,
+                                               (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmpge_epu64_mask(__mmask8 __M, __m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 5,
+                                                (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmpge_epu64_mask(__m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 5,
+                                                (__mmask8)-1);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmple_epi32_mask(__mmask16 __M, __m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 2,
+                                                (__mmask16)__M);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmple_epi32_mask(__m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 2,
+                                                (__mmask16)-1);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmple_epu32_mask(__mmask16 __M, __m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 2,
+                                                 (__mmask16)__M);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmple_epu32_mask(__m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 2,
+                                                 (__mmask16)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmple_epi64_mask(__mmask8 __M, __m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 2,
+                                               (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmple_epi64_mask(__m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 2,
+                                               (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmple_epu64_mask(__mmask8 __M, __m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 2,
+                                                (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmple_epu64_mask(__m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 2,
+                                                (__mmask8)-1);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmplt_epi32_mask(__mmask16 __M, __m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 1,
+                                                (__mmask16)__M);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmplt_epi32_mask(__m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 1,
+                                                (__mmask16)-1);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmplt_epu32_mask(__mmask16 __M, __m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 1,
+                                                 (__mmask16)__M);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmplt_epu32_mask(__m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 1,
+                                                 (__mmask16)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmplt_epi64_mask(__mmask8 __M, __m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 1,
+                                               (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmplt_epi64_mask(__m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 1,
+                                               (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmplt_epu64_mask(__mmask8 __M, __m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 1,
+                                                (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmplt_epu64_mask(__m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 1,
+                                                (__mmask8)-1);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmpneq_epi32_mask(__m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 4,
+                                                (__mmask16)-1);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmpneq_epi32_mask(__mmask16 __M, __m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 4,
+                                                (__mmask16)__M);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmpneq_epu32_mask(__mmask16 __M, __m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 4,
+                                                 (__mmask16)__M);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmpneq_epu32_mask(__m512i __X, __m512i __Y) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 4,
+                                                 (__mmask16)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmpneq_epi64_mask(__mmask8 __M, __m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 4,
+                                               (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmpneq_epi64_mask(__m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 4,
+                                               (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmpneq_epu64_mask(__mmask8 __M, __m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 4,
+                                                (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmpneq_epu64_mask(__m512i __X, __m512i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 4,
+                                                (__mmask8)-1);
+}
+
+#define _MM_CMPINT_EQ     0x0
+#define _MM_CMPINT_LT     0x1
+#define _MM_CMPINT_LE     0x2
+#define _MM_CMPINT_UNUSED 0x3
+#define _MM_CMPINT_NE     0x4
+#define _MM_CMPINT_NLT    0x5
+#define _MM_CMPINT_GE     0x5
+#define _MM_CMPINT_NLE    0x6
+#define _MM_CMPINT_GT     0x6
+
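+/* These are the predicate immediates of vpcmp/vpcmpu, and the magic
+   constants hard-coded by the named comparisons above: 1 = LT, 2 = LE,
+   4 = NE, 5 = NLT, 6 = NLE. Integer compares have no unordered case, so NLT
+   is the same predicate as GE and NLE the same as GT, hence the duplicate
+   values. Sketch (assuming __m512i values `x`, `y`):
+
+     __mmask16 k = _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_LT);
+     // k is the same mask _mm512_cmplt_epi32_mask(x, y) returns
+*/
+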
+#ifdef __OPTIMIZE__
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _kshiftli_mask16(__mmask16 __A, unsigned int __B) {
+  return (__mmask16)__builtin_ia32_kshiftlihi((__mmask16)__A, (__mmask8)__B);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _kshiftri_mask16(__mmask16 __A, unsigned int __B) {
+  return (__mmask16)__builtin_ia32_kshiftrihi((__mmask16)__A, (__mmask8)__B);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmp_epi64_mask(__m512i __X, __m512i __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, __P,
+                                               (__mmask8)-1);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmp_epi32_mask(__m512i __X, __m512i __Y, const int __P) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, __P,
+                                                (__mmask16)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmp_epu64_mask(__m512i __X, __m512i __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, __P,
+                                                (__mmask8)-1);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmp_epu32_mask(__m512i __X, __m512i __Y, const int __P) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y,
+                                                 __P, (__mmask16)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmp_round_pd_mask(__m512d __X, __m512d __Y, const int __P,
+                             const int __R) {
+  return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, __P,
+                                                (__mmask8)-1, __R);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_cmp_round_ps_mask(__m512 __X, __m512 __Y, const int __P,
+                             const int __R) {
+  return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y,
+                                                 __P, (__mmask16)-1, __R);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmp_epi64_mask(__mmask8 __U, __m512i __X, __m512i __Y,
+                               const int __P) {
+  return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, __P,
+                                               (__mmask8)__U);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmp_epi32_mask(__mmask16 __U, __m512i __X, __m512i __Y,
+                               const int __P) {
+  return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, __P,
+                                                (__mmask16)__U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmp_epu64_mask(__mmask8 __U, __m512i __X, __m512i __Y,
+                               const int __P) {
+  return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, __P,
+                                                (__mmask8)__U);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmp_epu32_mask(__mmask16 __U, __m512i __X, __m512i __Y,
+                               const int __P) {
+  return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y,
+                                                 __P, (__mmask16)__U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmp_round_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y,
+                                  const int __P, const int __R) {
+  return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, __P,
+                                                (__mmask8)__U, __R);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_cmp_round_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y,
+                                  const int __P, const int __R) {
+  return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y,
+                                                 __P, (__mmask16)__U, __R);
+}
+
+extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__,
+                                        __artificial__))
+_mm_cmp_round_sd_mask(__m128d __X, __m128d __Y, const int __P, const int __R) {
+  return (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)__X, (__v2df)__Y, __P,
+                                             (__mmask8)-1, __R);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmp_round_sd_mask(__mmask8 __M, __m128d __X, __m128d __Y,
+                               const int __P, const int __R) {
+  return (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)__X, (__v2df)__Y, __P,
+                                             (__mmask8)__M, __R);
+}
+
+extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__,
+                                        __artificial__))
+_mm_cmp_round_ss_mask(__m128 __X, __m128 __Y, const int __P, const int __R) {
+  return (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)__X, (__v4sf)__Y, __P,
+                                             (__mmask8)-1, __R);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmp_round_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y,
+                               const int __P, const int __R) {
+  return (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)__X, (__v4sf)__Y, __P,
+                                             (__mmask8)__M, __R);
+}
+
+#else
+#define _kshiftli_mask16(X, Y) \
+  ((__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(X), (__mmask8)(Y)))
+
+#define _kshiftri_mask16(X, Y) \
+  ((__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(X), (__mmask8)(Y)))
+
+#define _mm512_cmp_epi64_mask(X, Y, P) \
+  ((__mmask8)__builtin_ia32_cmpq512_mask( \
+      (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)-1))
+
+#define _mm512_cmp_epi32_mask(X, Y, P) \
+  ((__mmask16)__builtin_ia32_cmpd512_mask( \
+      (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)-1))
+
+#define _mm512_cmp_epu64_mask(X, Y, P) \
+  ((__mmask8)__builtin_ia32_ucmpq512_mask( \
+      (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)-1))
+
+#define _mm512_cmp_epu32_mask(X, Y, P) \
+  ((__mmask16)__builtin_ia32_ucmpd512_mask( \
+      (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)-1))
+
+#define _mm512_cmp_round_pd_mask(X, Y, P, R) \
+  ((__mmask8)__builtin_ia32_cmppd512_mask( \
+      (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)-1, R))
+
+#define _mm512_cmp_round_ps_mask(X, Y, P, R) \
+  ((__mmask16)__builtin_ia32_cmpps512_mask( \
+      (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)-1, R))
+
+#define _mm512_mask_cmp_epi64_mask(M, X, Y, P) \
+  ((__mmask8)__builtin_ia32_cmpq512_mask( \
+      (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)M))
+
+#define _mm512_mask_cmp_epi32_mask(M, X, Y, P) \
+  ((__mmask16)__builtin_ia32_cmpd512_mask( \
+      (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)M))
+
+#define _mm512_mask_cmp_epu64_mask(M, X, Y, P) \
+  ((__mmask8)__builtin_ia32_ucmpq512_mask( \
+      (__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)(P), (__mmask8)M))
+
+#define _mm512_mask_cmp_epu32_mask(M, X, Y, P) \
+  ((__mmask16)__builtin_ia32_ucmpd512_mask( \
+      (__v16si)(__m512i)(X), (__v16si)(__m512i)(Y), (int)(P), (__mmask16)M))
+
+#define _mm512_mask_cmp_round_pd_mask(M, X, Y, P, R) \
+  ((__mmask8)__builtin_ia32_cmppd512_mask( \
+      (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)M, R))
+
+#define _mm512_mask_cmp_round_ps_mask(M, X, Y, P, R) \
+  ((__mmask16)__builtin_ia32_cmpps512_mask( \
+      (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)M, R))
+
+#define _mm_cmp_round_sd_mask(X, Y, P, R) \
+  ((__mmask8)__builtin_ia32_cmpsd_mask( \
+      (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)-1, R))
+
+#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \
+  ((__mmask8)__builtin_ia32_cmpsd_mask( \
+      (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (M), R))
+
+#define _mm_cmp_round_ss_mask(X, Y, P, R) \
+  ((__mmask8)__builtin_ia32_cmpss_mask( \
+      (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)-1, R))
+
+#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \
+  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+                                       (__v4sf)(__m128)(Y), (int)(P), (M), R))
+#endif
+
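+/* Gathers: lane i of the result is loaded from __addr + __index[i] * __scale
+   when bit i of the mask is set; the unmasked forms below pass an all-ones
+   mask and an undefined pass-through vector, while the masked forms keep
+   lane i of __v1_old when the bit is clear.  __scale must be an immediate
+   value of 1, 2, 4, or 8. */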
+#ifdef __OPTIMIZE__
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_i32gather_ps(__m512i __index, void const *__addr, int __scale) {
+  __m512 __v1_old = _mm512_undefined_ps();
+  __mmask16 __mask = 0xFFFF;
+
+  return (__m512)__builtin_ia32_gathersiv16sf(
+      (__v16sf)__v1_old, __addr, (__v16si)__index, __mask, __scale);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i32gather_ps(__m512 __v1_old, __mmask16 __mask, __m512i __index,
+                             void const *__addr, int __scale) {
+  return (__m512)__builtin_ia32_gathersiv16sf(
+      (__v16sf)__v1_old, __addr, (__v16si)__index, __mask, __scale);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_i32gather_pd(__m256i __index, void const *__addr, int __scale) {
+  __m512d __v1_old = _mm512_undefined_pd();
+  __mmask8 __mask = 0xFF;
+
+  return (__m512d)__builtin_ia32_gathersiv8df((__v8df)__v1_old, __addr,
+                                              (__v8si)__index, __mask, __scale);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i32gather_pd(__m512d __v1_old, __mmask8 __mask, __m256i __index,
+                             void const *__addr, int __scale) {
+  return (__m512d)__builtin_ia32_gathersiv8df((__v8df)__v1_old, __addr,
+                                              (__v8si)__index, __mask, __scale);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_i64gather_ps(__m512i __index, void const *__addr, int __scale) {
+  __m256 __v1_old = _mm256_undefined_ps();
+  __mmask8 __mask = 0xFF;
+
+  return (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)__v1_old, __addr,
+                                              (__v8di)__index, __mask, __scale);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i64gather_ps(__m256 __v1_old, __mmask8 __mask, __m512i __index,
+                             void const *__addr, int __scale) {
+  return (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)__v1_old, __addr,
+                                              (__v8di)__index, __mask, __scale);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_i64gather_pd(__m512i __index, void const *__addr, int __scale) {
+  __m512d __v1_old = _mm512_undefined_pd();
+  __mmask8 __mask = 0xFF;
+
+  return (__m512d)__builtin_ia32_gatherdiv8df((__v8df)__v1_old, __addr,
+                                              (__v8di)__index, __mask, __scale);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i64gather_pd(__m512d __v1_old, __mmask8 __mask, __m512i __index,
+                             void const *__addr, int __scale) {
+  return (__m512d)__builtin_ia32_gatherdiv8df((__v8df)__v1_old, __addr,
+                                              (__v8di)__index, __mask, __scale);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_i32gather_epi32(__m512i __index, void const *__addr, int __scale) {
+  __m512i __v1_old = _mm512_undefined_epi32();
+  __mmask16 __mask = 0xFFFF;
+
+  return (__m512i)__builtin_ia32_gathersiv16si(
+      (__v16si)__v1_old, __addr, (__v16si)__index, __mask, __scale);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i32gather_epi32(__m512i __v1_old, __mmask16 __mask,
+                                __m512i __index, void const *__addr,
+                                int __scale) {
+  return (__m512i)__builtin_ia32_gathersiv16si(
+      (__v16si)__v1_old, __addr, (__v16si)__index, __mask, __scale);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_i32gather_epi64(__m256i __index, void const *__addr, int __scale) {
+  __m512i __v1_old = _mm512_undefined_epi32();
+  __mmask8 __mask = 0xFF;
+
+  return (__m512i)__builtin_ia32_gathersiv8di((__v8di)__v1_old, __addr,
+                                              (__v8si)__index, __mask, __scale);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i32gather_epi64(__m512i __v1_old, __mmask8 __mask,
+                                __m256i __index, void const *__addr,
+                                int __scale) {
+  return (__m512i)__builtin_ia32_gathersiv8di((__v8di)__v1_old, __addr,
+                                              (__v8si)__index, __mask, __scale);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_i64gather_epi32(__m512i __index, void const *__addr, int __scale) {
+  __m256i __v1_old = _mm256_undefined_si256();
+  __mmask8 __mask = 0xFF;
+
+  return (__m256i)__builtin_ia32_gatherdiv16si(
+      (__v8si)__v1_old, __addr, (__v8di)__index, __mask, __scale);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i64gather_epi32(__m256i __v1_old, __mmask8 __mask,
+                                __m512i __index, void const *__addr,
+                                int __scale) {
+  return (__m256i)__builtin_ia32_gatherdiv16si(
+      (__v8si)__v1_old, __addr, (__v8di)__index, __mask, __scale);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_i64gather_epi64(__m512i __index, void const *__addr, int __scale) {
+  __m512i __v1_old = _mm512_undefined_epi32();
+  __mmask8 __mask = 0xFF;
+
+  return (__m512i)__builtin_ia32_gatherdiv8di((__v8di)__v1_old, __addr,
+                                              (__v8di)__index, __mask, __scale);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i64gather_epi64(__m512i __v1_old, __mmask8 __mask,
+                                __m512i __index, void const *__addr,
+                                int __scale) {
+  return (__m512i)__builtin_ia32_gatherdiv8di((__v8di)__v1_old, __addr,
+                                              (__v8di)__index, __mask, __scale);
+}
+
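+/* Scatters are the store-side counterparts of the gathers above: element i
+   of __v1 is written to __addr + __index[i] * __scale when bit i of the
+   mask is set, and the unmasked forms simply use an all-ones mask. */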
+extern __inline void __attribute__((__gnu_inline__, __always_inline__,
+                                    __artificial__))
+_mm512_i32scatter_ps(void *__addr, __m512i __index, __m512 __v1, int __scale) {
+  __builtin_ia32_scattersiv16sf(__addr, (__mmask16)0xFFFF, (__v16si)__index,
+                                (__v16sf)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i32scatter_ps(void *__addr, __mmask16 __mask, __m512i __index,
+                              __m512 __v1, int __scale) {
+  __builtin_ia32_scattersiv16sf(__addr, __mask, (__v16si)__index, (__v16sf)__v1,
+                                __scale);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__,
+                                    __artificial__))
+_mm512_i32scatter_pd(void *__addr, __m256i __index, __m512d __v1, int __scale) {
+  __builtin_ia32_scattersiv8df(__addr, (__mmask8)0xFF, (__v8si)__index,
+                               (__v8df)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i32scatter_pd(void *__addr, __mmask8 __mask, __m256i __index,
+                              __m512d __v1, int __scale) {
+  __builtin_ia32_scattersiv8df(__addr, __mask, (__v8si)__index, (__v8df)__v1,
+                               __scale);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__,
+                                    __artificial__))
+_mm512_i64scatter_ps(void *__addr, __m512i __index, __m256 __v1, int __scale) {
+  __builtin_ia32_scatterdiv16sf(__addr, (__mmask8)0xFF, (__v8di)__index,
+                                (__v8sf)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i64scatter_ps(void *__addr, __mmask8 __mask, __m512i __index,
+                              __m256 __v1, int __scale) {
+  __builtin_ia32_scatterdiv16sf(__addr, __mask, (__v8di)__index, (__v8sf)__v1,
+                                __scale);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__,
+                                    __artificial__))
+_mm512_i64scatter_pd(void *__addr, __m512i __index, __m512d __v1, int __scale) {
+  __builtin_ia32_scatterdiv8df(__addr, (__mmask8)0xFF, (__v8di)__index,
+                               (__v8df)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i64scatter_pd(void *__addr, __mmask8 __mask, __m512i __index,
+                              __m512d __v1, int __scale) {
+  __builtin_ia32_scatterdiv8df(__addr, __mask, (__v8di)__index, (__v8df)__v1,
+                               __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_i32scatter_epi32(void *__addr, __m512i __index, __m512i __v1,
+                            int __scale) {
+  __builtin_ia32_scattersiv16si(__addr, (__mmask16)0xFFFF, (__v16si)__index,
+                                (__v16si)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i32scatter_epi32(void *__addr, __mmask16 __mask,
+                                 __m512i __index, __m512i __v1, int __scale) {
+  __builtin_ia32_scattersiv16si(__addr, __mask, (__v16si)__index, (__v16si)__v1,
+                                __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_i32scatter_epi64(void *__addr, __m256i __index, __m512i __v1,
+                            int __scale) {
+  __builtin_ia32_scattersiv8di(__addr, (__mmask8)0xFF, (__v8si)__index,
+                               (__v8di)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i32scatter_epi64(void *__addr, __mmask8 __mask, __m256i __index,
+                                 __m512i __v1, int __scale) {
+  __builtin_ia32_scattersiv8di(__addr, __mask, (__v8si)__index, (__v8di)__v1,
+                               __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_i64scatter_epi32(void *__addr, __m512i __index, __m256i __v1,
+                            int __scale) {
+  __builtin_ia32_scatterdiv16si(__addr, (__mmask8)0xFF, (__v8di)__index,
+                                (__v8si)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, __m512i __index,
+                                 __m256i __v1, int __scale) {
+  __builtin_ia32_scatterdiv16si(__addr, __mask, (__v8di)__index, (__v8si)__v1,
+                                __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_i64scatter_epi64(void *__addr, __m512i __index, __m512i __v1,
+                            int __scale) {
+  __builtin_ia32_scatterdiv8di(__addr, (__mmask8)0xFF, (__v8di)__index,
+                               (__v8di)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_i64scatter_epi64(void *__addr, __mmask8 __mask, __m512i __index,
+                                 __m512i __v1, int __scale) {
+  __builtin_ia32_scatterdiv8di(__addr, __mask, (__v8di)__index, (__v8di)__v1,
+                               __scale);
+}
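+/* Presumably, as in the upstream GCC headers, the macro definitions in the
+   !__OPTIMIZE__ branch below exist because at -O0 the always_inline wrappers
+   cannot guarantee that the scale argument reaches the builtin as the
+   immediate operand it is required to be. */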
+#else
+#define _mm512_i32gather_ps(INDEX, ADDR, SCALE) \
+  (__m512) __builtin_ia32_gathersiv16sf( \
+      (__v16sf)_mm512_undefined_ps(), (void const *)ADDR, \
+      (__v16si)(__m512i)INDEX, (__mmask16)0xFFFF, (int)SCALE)
+
+#define _mm512_mask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
+  (__m512) __builtin_ia32_gathersiv16sf( \
+      (__v16sf)(__m512)V1OLD, (void const *)ADDR, (__v16si)(__m512i)INDEX, \
+      (__mmask16)MASK, (int)SCALE)
+
+#define _mm512_i32gather_pd(INDEX, ADDR, SCALE) \
+  (__m512d) __builtin_ia32_gathersiv8df( \
+      (__v8df)_mm512_undefined_pd(), (void const *)ADDR, \
+      (__v8si)(__m256i)INDEX, (__mmask8)0xFF, (int)SCALE)
+
+#define _mm512_mask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
+  (__m512d) __builtin_ia32_gathersiv8df( \
+      (__v8df)(__m512d)V1OLD, (void const *)ADDR, (__v8si)(__m256i)INDEX, \
+      (__mmask8)MASK, (int)SCALE)
+
+#define _mm512_i64gather_ps(INDEX, ADDR, SCALE) \
+  (__m256) __builtin_ia32_gatherdiv16sf( \
+      (__v8sf)_mm256_undefined_ps(), (void const *)ADDR, \
+      (__v8di)(__m512i)INDEX, (__mmask8)0xFF, (int)SCALE)
+
+#define _mm512_mask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
+  (__m256) __builtin_ia32_gatherdiv16sf( \
+      (__v8sf)(__m256)V1OLD, (void const *)ADDR, (__v8di)(__m512i)INDEX, \
+      (__mmask8)MASK, (int)SCALE)
+
+#define _mm512_i64gather_pd(INDEX, ADDR, SCALE) \
+  (__m512d) __builtin_ia32_gatherdiv8df( \
+      (__v8df)_mm512_undefined_pd(), (void const *)ADDR, \
+      (__v8di)(__m512i)INDEX, (__mmask8)0xFF, (int)SCALE)
+
+#define _mm512_mask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
+  (__m512d) __builtin_ia32_gatherdiv8df( \
+      (__v8df)(__m512d)V1OLD, (void const *)ADDR, (__v8di)(__m512i)INDEX, \
+      (__mmask8)MASK, (int)SCALE)
+
+#define _mm512_i32gather_epi32(INDEX, ADDR, SCALE) \
+  (__m512i) __builtin_ia32_gathersiv16si( \
+      (__v16si)_mm512_undefined_epi32(), (void const *)ADDR, \
+      (__v16si)(__m512i)INDEX, (__mmask16)0xFFFF, (int)SCALE)
+
+#define _mm512_mask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
+  (__m512i) __builtin_ia32_gathersiv16si( \
+      (__v16si)(__m512i)V1OLD, (void const *)ADDR, (__v16si)(__m512i)INDEX, \
+      (__mmask16)MASK, (int)SCALE)
+
+#define _mm512_i32gather_epi64(INDEX, ADDR, SCALE) \
+  (__m512i) __builtin_ia32_gathersiv8di( \
+      (__v8di)_mm512_undefined_epi32(), (void const *)ADDR, \
+      (__v8si)(__m256i)INDEX, (__mmask8)0xFF, (int)SCALE)
+
+#define _mm512_mask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
+  (__m512i) __builtin_ia32_gathersiv8di( \
+      (__v8di)(__m512i)V1OLD, (void const *)ADDR, (__v8si)(__m256i)INDEX, \
+      (__mmask8)MASK, (int)SCALE)
+
+#define _mm512_i64gather_epi32(INDEX, ADDR, SCALE) \
+  (__m256i) __builtin_ia32_gatherdiv16si( \
+      (__v8si)_mm256_undefined_si256(), (void const *)ADDR, \
+      (__v8di)(__m512i)INDEX, (__mmask8)0xFF, (int)SCALE)
+
+#define _mm512_mask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
+  (__m256i) __builtin_ia32_gatherdiv16si( \
+      (__v8si)(__m256i)V1OLD, (void const *)ADDR, (__v8di)(__m512i)INDEX, \
+      (__mmask8)MASK, (int)SCALE)
+
+#define _mm512_i64gather_epi64(INDEX, ADDR, SCALE) \
+  (__m512i) __builtin_ia32_gatherdiv8di( \
+      (__v8di)_mm512_undefined_epi32(), (void const *)ADDR, \
+      (__v8di)(__m512i)INDEX, (__mmask8)0xFF, (int)SCALE)
+
+#define _mm512_mask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
+  (__m512i) __builtin_ia32_gatherdiv8di( \
+      (__v8di)(__m512i)V1OLD, (void const *)ADDR, (__v8di)(__m512i)INDEX, \
+      (__mmask8)MASK, (int)SCALE)
+
+#define _mm512_i32scatter_ps(ADDR, INDEX, V1, SCALE) \
+  __builtin_ia32_scattersiv16sf((void *)ADDR, (__mmask16)0xFFFF, \
+                                (__v16si)(__m512i)INDEX, (__v16sf)(__m512)V1, \
+                                (int)SCALE)
+
+#define _mm512_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
+  __builtin_ia32_scattersiv16sf((void *)ADDR, (__mmask16)MASK, \
+                                (__v16si)(__m512i)INDEX, (__v16sf)(__m512)V1, \
+                                (int)SCALE)
+
+#define _mm512_i32scatter_pd(ADDR, INDEX, V1, SCALE) \
+  __builtin_ia32_scattersiv8df((void *)ADDR, (__mmask8)0xFF, \
+                               (__v8si)(__m256i)INDEX, (__v8df)(__m512d)V1, \
+                               (int)SCALE)
+
+#define _mm512_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
+  __builtin_ia32_scattersiv8df((void *)ADDR, (__mmask8)MASK, \
+                               (__v8si)(__m256i)INDEX, (__v8df)(__m512d)V1, \
+                               (int)SCALE)
+
+#define _mm512_i64scatter_ps(ADDR, INDEX, V1, SCALE) \
+  __builtin_ia32_scatterdiv16sf((void *)ADDR, (__mmask8)0xFF, \
+                                (__v8di)(__m512i)INDEX, (__v8sf)(__m256)V1, \
+                                (int)SCALE)
+
+/* note: the mask here was cast to __mmask16, unlike the inline version and
+   the builtin's __mmask8 operand; the cast below matches the latter */
+#define _mm512_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
+  __builtin_ia32_scatterdiv16sf((void *)ADDR, (__mmask8)MASK, \
+                                (__v8di)(__m512i)INDEX, (__v8sf)(__m256)V1, \
+                                (int)SCALE)
+
+#define _mm512_i64scatter_pd(ADDR, INDEX, V1, SCALE) \
+  __builtin_ia32_scatterdiv8df((void *)ADDR, (__mmask8)0xFF, \
+                               (__v8di)(__m512i)INDEX, (__v8df)(__m512d)V1, \
+                               (int)SCALE)
+
+#define _mm512_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
+  __builtin_ia32_scatterdiv8df((void *)ADDR, (__mmask8)MASK, \
+                               (__v8di)(__m512i)INDEX, (__v8df)(__m512d)V1, \
+                               (int)SCALE)
+
+#define _mm512_i32scatter_epi32(ADDR, INDEX, V1, SCALE) \
+  __builtin_ia32_scattersiv16si((void *)ADDR, (__mmask16)0xFFFF, \
+                                (__v16si)(__m512i)INDEX, (__v16si)(__m512i)V1, \
+                                (int)SCALE)
+
+#define _mm512_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
+  __builtin_ia32_scattersiv16si((void *)ADDR, (__mmask16)MASK, \
+                                (__v16si)(__m512i)INDEX, (__v16si)(__m512i)V1, \
+                                (int)SCALE)
+
+#define _mm512_i32scatter_epi64(ADDR, INDEX, V1, SCALE) \
+  __builtin_ia32_scattersiv8di((void *)ADDR, (__mmask8)0xFF, \
+                               (__v8si)(__m256i)INDEX, (__v8di)(__m512i)V1, \
+                               (int)SCALE)
+
+#define _mm512_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
+  __builtin_ia32_scattersiv8di((void *)ADDR, (__mmask8)MASK, \
+                               (__v8si)(__m256i)INDEX, (__v8di)(__m512i)V1, \
+                               (int)SCALE)
+
+#define _mm512_i64scatter_epi32(ADDR, INDEX, V1, SCALE) \
+  __builtin_ia32_scatterdiv16si((void *)ADDR, (__mmask8)0xFF, \
+                                (__v8di)(__m512i)INDEX, (__v8si)(__m256i)V1, \
+                                (int)SCALE)
+
+#define _mm512_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
+  __builtin_ia32_scatterdiv16si((void *)ADDR, (__mmask8)MASK, \
+                                (__v8di)(__m512i)INDEX, (__v8si)(__m256i)V1, \
+                                (int)SCALE)
+
+#define _mm512_i64scatter_epi64(ADDR, INDEX, V1, SCALE) \
+  __builtin_ia32_scatterdiv8di((void *)ADDR, (__mmask8)0xFF, \
+                               (__v8di)(__m512i)INDEX, (__v8di)(__m512i)V1, \
+                               (int)SCALE)
+
+#define _mm512_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
+  __builtin_ia32_scatterdiv8di((void *)ADDR, (__mmask8)MASK, \
+                               (__v8di)(__m512i)INDEX, (__v8di)(__m512i)V1, \
+                               (int)SCALE)
+#endif
+
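+/* Compress packs the lanes selected by the mask into the low lanes of the
+   result (merging into __W or zeroing the remainder); compressstoreu writes
+   only the selected lanes, contiguously, to an unaligned address.  A sketch,
+   with out being a sufficiently large int32_t buffer:
+
+     __mmask16 keep = _mm512_cmpneq_epi32_mask(v, _mm512_setzero_si512());
+     _mm512_mask_compressstoreu_epi32(out, keep, v);  // stores the nonzeros
+
+   The expand operations further below perform the inverse, spreading packed
+   low lanes back out into the masked lane positions. */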
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_compress_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  return (__m512d)__builtin_ia32_compressdf512_mask((__v8df)__A, (__v8df)__W,
+                                                    (__mmask8)__U);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_compress_pd(__mmask8 __U, __m512d __A) {
+  return (__m512d)__builtin_ia32_compressdf512_mask(
+      (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m512d __A) {
+  __builtin_ia32_compressstoredf512_mask((__v8df *)__P, (__v8df)__A,
+                                         (__mmask8)__U);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_compress_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  return (__m512)__builtin_ia32_compresssf512_mask((__v16sf)__A, (__v16sf)__W,
+                                                   (__mmask16)__U);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_compress_ps(__mmask16 __U, __m512 __A) {
+  return (__m512)__builtin_ia32_compresssf512_mask(
+      (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_compressstoreu_ps(void *__P, __mmask16 __U, __m512 __A) {
+  __builtin_ia32_compressstoresf512_mask((__v16sf *)__P, (__v16sf)__A,
+                                         (__mmask16)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_compress_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_compressdi512_mask((__v8di)__A, (__v8di)__W,
+                                                    (__mmask8)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_compress_epi64(__mmask8 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_compressdi512_mask(
+      (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_compressstoreu_epi64(void *__P, __mmask8 __U, __m512i __A) {
+  __builtin_ia32_compressstoredi512_mask((__v8di *)__P, (__v8di)__A,
+                                         (__mmask8)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_compress_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_compresssi512_mask((__v16si)__A, (__v16si)__W,
+                                                    (__mmask16)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_compress_epi32(__mmask16 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_compresssi512_mask(
+      (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_compressstoreu_epi32(void *__P, __mmask16 __U, __m512i __A) {
+  __builtin_ia32_compressstoresi512_mask((__v16si *)__P, (__v16si)__A,
+                                         (__mmask16)__U);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_expand_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  return (__m512d)__builtin_ia32_expanddf512_mask((__v8df)__A, (__v8df)__W,
+                                                  (__mmask8)__U);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_expand_pd(__mmask8 __U, __m512d __A) {
+  return (__m512d)__builtin_ia32_expanddf512_maskz(
+      (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P) {
+  return (__m512d)__builtin_ia32_expandloaddf512_mask(
+      (const __v8df *)__P, (__v8df)__W, (__mmask8)__U);
+}
+
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P) {
+  return (__m512d)__builtin_ia32_expandloaddf512_maskz(
+      (const __v8df *)__P, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_expand_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  return (__m512)__builtin_ia32_expandsf512_mask((__v16sf)__A, (__v16sf)__W,
+                                                 (__mmask16)__U);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_expand_ps(__mmask16 __U, __m512 __A) {
+  return (__m512)__builtin_ia32_expandsf512_maskz(
+      (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P) {
+  return (__m512)__builtin_ia32_expandloadsf512_mask(
+      (const __v16sf *)__P, (__v16sf)__W, (__mmask16)__U);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P) {
+  return (__m512)__builtin_ia32_expandloadsf512_maskz(
+      (const __v16sf *)__P, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_expand_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_expanddi512_mask((__v8di)__A, (__v8di)__W,
+                                                  (__mmask8)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_expand_epi64(__mmask8 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_expanddi512_maskz(
+      (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P) {
+  return (__m512i)__builtin_ia32_expandloaddi512_mask(
+      (const __v8di *)__P, (__v8di)__W, (__mmask8)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) {
+  return (__m512i)__builtin_ia32_expandloaddi512_maskz(
+      (const __v8di *)__P, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_expand_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_expandsi512_mask((__v16si)__A, (__v16si)__W,
+                                                  (__mmask16)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_expand_epi32(__mmask16 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_expandsi512_maskz(
+      (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P) {
+  return (__m512i)__builtin_ia32_expandloadsi512_mask(
+      (const __v16si *)__P, (__v16si)__W, (__mmask16)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P) {
+  return (__m512i)__builtin_ia32_expandloadsi512_maskz(
+      (const __v16si *)__P, (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
+}
+
+/* Mask arithmetic operations */
+#define _kand_mask16 _mm512_kand
+#define _kandn_mask16 _mm512_kandn
+#define _knot_mask16 _mm512_knot
+#define _kor_mask16 _mm512_kor
+#define _kxnor_mask16 _mm512_kxnor
+#define _kxor_mask16 _mm512_kxor
+
+extern __inline unsigned char
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__CF) {
+  *__CF = (unsigned char)__builtin_ia32_kortestchi(__A, __B);
+  return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
+}
+
+extern __inline unsigned char
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _kortestz_mask16_u8(__mmask16 __A, __mmask16 __B) {
+  return (unsigned char)__builtin_ia32_kortestzhi((__mmask16)__A,
+                                                  (__mmask16)__B);
+}
+
+extern __inline unsigned char
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _kortestc_mask16_u8(__mmask16 __A, __mmask16 __B) {
+  return (unsigned char)__builtin_ia32_kortestchi((__mmask16)__A,
+                                                  (__mmask16)__B);
+}
+
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _cvtmask16_u32(__mmask16 __A) {
+  return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _cvtu32_mask16(unsigned int __A) {
+  return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _load_mask16(__mmask16 *__A) {
+  return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _store_mask16(__mmask16 *__A, __mmask16 __B) {
+  *(__mmask16 *)__A = __builtin_ia32_kmovw(__B);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_kand(__mmask16 __A, __mmask16 __B) {
+  return (__mmask16)__builtin_ia32_kandhi((__mmask16)__A, (__mmask16)__B);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_kandn(__mmask16 __A, __mmask16 __B) {
+  return (__mmask16)__builtin_ia32_kandnhi((__mmask16)__A, (__mmask16)__B);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_kor(__mmask16 __A, __mmask16 __B) {
+  return (__mmask16)__builtin_ia32_korhi((__mmask16)__A, (__mmask16)__B);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_kortestz(__mmask16 __A, __mmask16 __B) {
+  return (__mmask16)__builtin_ia32_kortestzhi((__mmask16)__A, (__mmask16)__B);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_kortestc(__mmask16 __A, __mmask16 __B) {
+  return (__mmask16)__builtin_ia32_kortestchi((__mmask16)__A, (__mmask16)__B);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_kxnor(__mmask16 __A, __mmask16 __B) {
+  return (__mmask16)__builtin_ia32_kxnorhi((__mmask16)__A, (__mmask16)__B);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_kxor(__mmask16 __A, __mmask16 __B) {
+  return (__mmask16)__builtin_ia32_kxorhi((__mmask16)__A, (__mmask16)__B);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_knot(__mmask16 __A) {
+  return (__mmask16)__builtin_ia32_knothi((__mmask16)__A);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_kunpackb(__mmask16 __A, __mmask16 __B) {
+  return (__mmask16)__builtin_ia32_kunpckhi((__mmask16)__A, (__mmask16)__B);
+}
+
+extern __inline __mmask16
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _kunpackb_mask16(__mmask8 __A, __mmask8 __B) {
+  return (__mmask16)__builtin_ia32_kunpckhi((__mmask16)__A, (__mmask16)__B);
+}
+
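+/* The k-register helpers above map onto KORTEST/KUNPCK/KMOV: kortestz
+   returns 1 when the OR of two masks is all zeroes, kortestc when it is all
+   ones, and kunpackb concatenates the low bytes of two masks into one
+   16-bit mask. */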
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_inserti32x4(__mmask16 __B, __m512i __C, __m128i __D,
+                             const int __imm) {
+  return (__m512i)__builtin_ia32_inserti32x4_mask(
+      (__v16si)__C, (__v4si)__D, __imm, (__v16si)_mm512_setzero_si512(), __B);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_insertf32x4(__mmask16 __B, __m512 __C, __m128 __D,
+                             const int __imm) {
+  return (__m512)__builtin_ia32_insertf32x4_mask(
+      (__v16sf)__C, (__v4sf)__D, __imm, (__v16sf)_mm512_setzero_ps(), __B);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_inserti32x4(__m512i __A, __mmask16 __B, __m512i __C,
+                            __m128i __D, const int __imm) {
+  return (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)__C, (__v4si)__D,
+                                                  __imm, (__v16si)__A, __B);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_insertf32x4(__m512 __A, __mmask16 __B, __m512 __C, __m128 __D,
+                            const int __imm) {
+  return (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)__C, (__v4sf)__D,
+                                                 __imm, (__v16sf)__A, __B);
+}
+#else
+#define _mm512_maskz_insertf32x4(A, X, Y, C) \
+  ((__m512)__builtin_ia32_insertf32x4_mask( \
+      (__v16sf)(__m512)(X), (__v4sf)(__m128)(Y), (int)(C), \
+      (__v16sf)_mm512_setzero_ps(), (__mmask16)(A)))
+
+#define _mm512_maskz_inserti32x4(A, X, Y, C) \
+  ((__m512i)__builtin_ia32_inserti32x4_mask( \
+      (__v16si)(__m512i)(X), (__v4si)(__m128i)(Y), (int)(C), \
+      (__v16si)_mm512_setzero_si512(), (__mmask16)(A)))
+
+#define _mm512_mask_insertf32x4(A, B, X, Y, C) \
+  ((__m512)__builtin_ia32_insertf32x4_mask( \
+      (__v16sf)(__m512)(X), (__v4sf)(__m128)(Y), (int)(C), \
+      (__v16sf)(__m512)(A), (__mmask16)(B)))
+
+#define _mm512_mask_inserti32x4(A, B, X, Y, C) \
+  ((__m512i)__builtin_ia32_inserti32x4_mask( \
+      (__v16si)(__m512i)(X), (__v4si)(__m128i)(Y), (int)(C), \
+      (__v16si)(__m512i)(A), (__mmask16)(B)))
+#endif
+
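+/* Naming convention used throughout this header: _mm512_mask_* forms take a
+   write-through operand (__W) whose lane i survives when mask bit i is
+   clear, while _mm512_maskz_* forms zero that lane instead, e.g.
+   _mm512_maskz_max_epi64(0x0f, __a, __b) computes maxima in lanes 0..3 and
+   zeroes lanes 4..7. */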
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_max_epi64(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pmaxsq512_mask(
+      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_max_epi64(__mmask8 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pmaxsq512_mask(
+      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), __M);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_max_epi64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pmaxsq512_mask((__v8di)__A, (__v8di)__B,
+                                                (__v8di)__W, __M);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_min_epi64(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pminsq512_mask(
+      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_min_epi64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pminsq512_mask((__v8di)__A, (__v8di)__B,
+                                                (__v8di)__W, __M);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_min_epi64(__mmask8 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pminsq512_mask(
+      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), __M);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_max_epu64(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pmaxuq512_mask(
+      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_max_epu64(__mmask8 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pmaxuq512_mask(
+      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), __M);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_max_epu64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pmaxuq512_mask((__v8di)__A, (__v8di)__B,
+                                                (__v8di)__W, __M);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_min_epu64(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pminuq512_mask(
+      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_min_epu64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pminuq512_mask((__v8di)__A, (__v8di)__B,
+                                                (__v8di)__W, __M);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_min_epu64(__mmask8 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pminuq512_mask(
+      (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), __M);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_max_epi32(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pmaxsd512_mask(
+      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(),
+      (__mmask16)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_max_epi32(__mmask16 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pmaxsd512_mask(
+      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), __M);
+}
+
+extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm512_mask_max_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pmaxsd512_mask((__v16si)__A, (__v16si)__B,
+                                                (__v16si)__W, __M);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_min_epi32(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pminsd512_mask(
+      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(),
+      (__mmask16)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_min_epi32(__mmask16 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pminsd512_mask(
+      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), __M);
+}
+
+extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm512_mask_min_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pminsd512_mask((__v16si)__A, (__v16si)__B,
+                                                (__v16si)__W, __M);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_max_epu32(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pmaxud512_mask(
+      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(),
+      (__mmask16)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_max_epu32(__mmask16 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pmaxud512_mask(
+      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), __M);
+}
+
+extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm512_mask_max_epu32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pmaxud512_mask((__v16si)__A, (__v16si)__B,
+                                                (__v16si)__W, __M);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_min_epu32(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pminud512_mask(
+      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(),
+      (__mmask16)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_min_epu32(__mmask16 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pminud512_mask(
+      (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), __M);
+}
+
+extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm512_mask_min_epu32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_pminud512_mask((__v16si)__A, (__v16si)__B,
+                                                (__v16si)__W, __M);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_unpacklo_ps(__m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_unpcklps512_mask((__v16sf)__A, (__v16sf)__B,
+                                                 (__v16sf)_mm512_undefined_ps(),
+                                                 (__mmask16)-1);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_unpcklps512_mask((__v16sf)__A, (__v16sf)__B,
+                                                 (__v16sf)__W, (__mmask16)__U);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_unpacklo_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  return (__m512)__builtin_ia32_unpcklps512_mask(
+      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
+}
+
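+/* The *_round_* scalar forms below take an explicit rounding override __R,
+   such as _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, or
+   _MM_FROUND_CUR_DIRECTION to keep the rounding mode currently set in
+   MXCSR. */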
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_max_round_sd(__m128d __A, __m128d __B, const int __R) {
+  return (__m128d)__builtin_ia32_maxsd_round((__v2df)__A, (__v2df)__B, __R);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_max_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+                          const int __R) {
+  return (__m128d)__builtin_ia32_maxsd_mask_round(
+      (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm_maskz_max_round_sd(__mmask8 __U, __m128d __A, __m128d __B, const int __R) {
+  return (__m128d)__builtin_ia32_maxsd_mask_round(
+      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_max_round_ss(__m128 __A, __m128 __B, const int __R) {
+  return (__m128)__builtin_ia32_maxss_round((__v4sf)__A, (__v4sf)__B, __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_max_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+                          const int __R) {
+  return (__m128)__builtin_ia32_maxss_mask_round(
+      (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
+                                      __artificial__))
+_mm_maskz_max_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) {
+  return (__m128)__builtin_ia32_maxss_mask_round(
+      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_min_round_sd(__m128d __A, __m128d __B, const int __R) {
+  return (__m128d)__builtin_ia32_minsd_round((__v2df)__A, (__v2df)__B, __R);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_min_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+                          const int __R) {
+  return (__m128d)__builtin_ia32_minsd_mask_round(
+      (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm_maskz_min_round_sd(__mmask8 __U, __m128d __A, __m128d __B, const int __R) {
+  return (__m128d)__builtin_ia32_minsd_mask_round(
+      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_min_round_ss(__m128 __A, __m128 __B, const int __R) {
+  return (__m128)__builtin_ia32_minss_round((__v4sf)__A, (__v4sf)__B, __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_min_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+                          const int __R) {
+  return (__m128)__builtin_ia32_minss_mask_round(
+      (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
+                                      __artificial__))
+_mm_maskz_min_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) {
+  return (__m128)__builtin_ia32_minss_mask_round(
+      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R);
+}
+
+#else
+#define _mm_max_round_sd(A, B, C) (__m128d) __builtin_ia32_maxsd_round(A, B, C)
+
+#define _mm_mask_max_round_sd(W, U, A, B, C) \
+  (__m128d) __builtin_ia32_maxsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_max_round_sd(U, A, B, C) \
+  (__m128d) \
+      __builtin_ia32_maxsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_max_round_ss(A, B, C) (__m128) __builtin_ia32_maxss_round(A, B, C)
+
+#define _mm_mask_max_round_ss(W, U, A, B, C) \
+  (__m128) __builtin_ia32_maxss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_max_round_ss(U, A, B, C) \
+  (__m128) __builtin_ia32_maxss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#define _mm_min_round_sd(A, B, C) (__m128d) __builtin_ia32_minsd_round(A, B, C)
+
+#define _mm_mask_min_round_sd(W, U, A, B, C) \
+  (__m128d) __builtin_ia32_minsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_min_round_sd(U, A, B, C) \
+  (__m128d) \
+      __builtin_ia32_minsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_min_round_ss(A, B, C) (__m128) __builtin_ia32_minss_round(A, B, C)
+
+#define _mm_mask_min_round_ss(W, U, A, B, C) \
+  (__m128) __builtin_ia32_minss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_min_round_ss(U, A, B, C) \
+  (__m128) __builtin_ia32_minss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#endif
+
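+/* Mask blends: lane i of the result is taken from __W when bit i of __U is
+   set and from __A otherwise, i.e. a per-lane select driven by a mask
+   register. */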
+extern __inline __m512d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W) {
+  return (__m512d)__builtin_ia32_blendmpd_512_mask((__v8df)__A, (__v8df)__W,
+                                                   (__mmask8)__U);
+}
+
+extern __inline __m512
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W) {
+  return (__m512)__builtin_ia32_blendmps_512_mask((__v16sf)__A, (__v16sf)__W,
+                                                  (__mmask16)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W) {
+  return (__m512i)__builtin_ia32_blendmq_512_mask((__v8di)__A, (__v8di)__W,
+                                                  (__mmask8)__U);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) {
+  return (__m512i)__builtin_ia32_blendmd_512_mask((__v16si)__A, (__v16si)__W,
+                                                  (__mmask16)__U);
+}
+
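+/* Scalar FMA sign conventions, visible in the operand negations below:
+   fmadd computes __W * __A + __B, fmsub __W * __A - __B, fnmadd
+   -(__W * __A) + __B, and fnmsub -(__W * __A) - __B, each in lane 0 with
+   the upper lane(s) taken from __W. */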
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_fmadd_round_sd(__m128d __W, __m128d __A, __m128d __B, const int __R) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_round((__v2df)__W, (__v2df)__A,
+                                                 (__v2df)__B, __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_fmadd_round_ss(__m128 __W, __m128 __A, __m128 __B, const int __R) {
+  return (__m128)__builtin_ia32_vfmaddss3_round((__v4sf)__W, (__v4sf)__A,
+                                                (__v4sf)__B, __R);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_fmsub_round_sd(__m128d __W, __m128d __A, __m128d __B, const int __R) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_round((__v2df)__W, (__v2df)__A,
+                                                 -(__v2df)__B, __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_fmsub_round_ss(__m128 __W, __m128 __A, __m128 __B, const int __R) {
+  return (__m128)__builtin_ia32_vfmaddss3_round((__v4sf)__W, (__v4sf)__A,
+                                                -(__v4sf)__B, __R);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_fnmadd_round_sd(__m128d __W, __m128d __A, __m128d __B, const int __R) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_round((__v2df)__W, -(__v2df)__A,
+                                                 (__v2df)__B, __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_fnmadd_round_ss(__m128 __W, __m128 __A, __m128 __B, const int __R) {
+  return (__m128)__builtin_ia32_vfmaddss3_round((__v4sf)__W, -(__v4sf)__A,
+                                                (__v4sf)__B, __R);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_fnmsub_round_sd(__m128d __W, __m128d __A, __m128d __B, const int __R) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_round((__v2df)__W, -(__v2df)__A,
+                                                 -(__v2df)__B, __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_fnmsub_round_ss(__m128 __W, __m128 __A, __m128 __B, const int __R) {
+  return (__m128)__builtin_ia32_vfmaddss3_round((__v4sf)__W, -(__v4sf)__A,
+                                                -(__v4sf)__B, __R);
+}
+#else
+#define _mm_fmadd_round_sd(A, B, C, R) \
+  (__m128d) __builtin_ia32_vfmaddsd3_round(A, B, C, R)
+
+#define _mm_fmadd_round_ss(A, B, C, R) \
+  (__m128) __builtin_ia32_vfmaddss3_round(A, B, C, R)
+
+#define _mm_fmsub_round_sd(A, B, C, R) \
+  (__m128d) __builtin_ia32_vfmaddsd3_round(A, B, -(C), R)
+
+#define _mm_fmsub_round_ss(A, B, C, R) \
+  (__m128) __builtin_ia32_vfmaddss3_round(A, B, -(C), R)
+
+#define _mm_fnmadd_round_sd(A, B, C, R) \
+  (__m128d) __builtin_ia32_vfmaddsd3_round(A, -(B), C, R)
+
+#define _mm_fnmadd_round_ss(A, B, C, R) \
+  (__m128) __builtin_ia32_vfmaddss3_round(A, -(B), C, R)
+
+#define _mm_fnmsub_round_sd(A, B, C, R) \
+  (__m128d) __builtin_ia32_vfmaddsd3_round(A, -(B), -(C), R)
+
+#define _mm_fnmsub_round_ss(A, B, C, R) \
+  (__m128) __builtin_ia32_vfmaddss3_round(A, -(B), -(C), R)
+#endif
+
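+/* The masked scalar FMA forms differ in what lane 0 falls back to when the
+   mask bit is clear: _mm_mask_* keeps the first source (__W), _mm_mask3_*
+   keeps the addend (the third vector operand), and _mm_maskz_* zeroes the
+   lane. */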
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W, (__v2df)__A,
+                                                (__v2df)__B, (__mmask8)__U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, (__v4sf)__A,
+                                               (__v4sf)__B, (__mmask8)__U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fmadd_sd(__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)__W, (__v2df)__A,
+                                                 (__v2df)__B, (__mmask8)__U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fmadd_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) {
+  return (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)__W, (__v4sf)__A,
+                                                (__v4sf)__B, (__mmask8)__U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fmadd_sd(__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)__W, (__v2df)__A,
+                                                 (__v2df)__B, (__mmask8)__U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fmadd_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)__W, (__v4sf)__A,
+                                                (__v4sf)__B, (__mmask8)__U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W, (__v2df)__A,
+                                                -(__v2df)__B, (__mmask8)__U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, (__v4sf)__A,
+                                               -(__v4sf)__B, (__mmask8)__U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fmsub_sd(__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) {
+  return (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)__W, (__v2df)__A,
+                                                 (__v2df)__B, (__mmask8)__U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fmsub_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) {
+  return (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)__W, (__v4sf)__A,
+                                                (__v4sf)__B, (__mmask8)__U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fmsub_sd(__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)__W, (__v2df)__A,
+                                                 -(__v2df)__B, (__mmask8)__U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fmsub_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)__W, (__v4sf)__A,
+                                                -(__v4sf)__B, (__mmask8)__U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fnmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W, -(__v2df)__A,
+                                                (__v2df)__B, (__mmask8)__U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fnmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, -(__v4sf)__A,
+                                               (__v4sf)__B, (__mmask8)__U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fnmadd_sd(__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)__W, -(__v2df)__A,
+                                                 (__v2df)__B, (__mmask8)__U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fnmadd_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) {
+  return (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)__W, -(__v4sf)__A,
+                                                (__v4sf)__B, (__mmask8)__U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fnmadd_sd(__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)__W, -(__v2df)__A,
+                                                 (__v2df)__B, (__mmask8)__U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fnmadd_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)__W, -(__v4sf)__A,
+                                                (__v4sf)__B, (__mmask8)__U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fnmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W, -(__v2df)__A,
+                                                -(__v2df)__B, (__mmask8)__U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fnmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, -(__v4sf)__A,
+                                               -(__v4sf)__B, (__mmask8)__U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fnmsub_sd(__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) {
+  return (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)__W, -(__v2df)__A,
+                                                 (__v2df)__B, (__mmask8)__U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fnmsub_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) {
+  return (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)__W, -(__v4sf)__A,
+                                                (__v4sf)__B, (__mmask8)__U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fnmsub_sd(__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)__W, -(__v2df)__A,
+                                                 -(__v2df)__B, (__mmask8)__U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fnmsub_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)__W, -(__v4sf)__A,
+                                                -(__v4sf)__B, (__mmask8)__U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+                            const int __R) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_mask(
+      (__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+                            const int __R) {
+  return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, (__v4sf)__A,
+                                               (__v4sf)__B, (__mmask8)__U, __R);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fmadd_round_sd(__m128d __W, __m128d __A, __m128d __B,
+                             __mmask8 __U, const int __R) {
+  return (__m128d)__builtin_ia32_vfmaddsd3_mask3(
+      (__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, __R);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fmadd_round_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
+                             const int __R) {
+  return (__m128)__builtin_ia32_vfmaddss3_mask3(
+      (__v4sf)__W, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R);
+}
+
__always_inline__, __artificial__)) + _mm_maskz_fmadd_round_sd(__mmask8 __U, __m128d __W, __m128d __A, + __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_vfmaddsd3_maskz( + (__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_fmadd_round_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, + const int __R) { + return (__m128)__builtin_ia32_vfmaddss3_maskz( + (__v4sf)__W, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) { + return (__m128d)__builtin_ia32_vfmaddsd3_mask( + (__v2df)__W, (__v2df)__A, -(__v2df)__B, (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) { + return (__m128)__builtin_ia32_vfmaddss3_mask( + (__v4sf)__W, (__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask3_fmsub_round_sd(__m128d __W, __m128d __A, __m128d __B, + __mmask8 __U, const int __R) { + return (__m128d)__builtin_ia32_vfmsubsd3_mask3( + (__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask3_fmsub_round_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U, + const int __R) { + return (__m128)__builtin_ia32_vfmsubss3_mask3( + (__v4sf)__W, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_fmsub_round_sd(__mmask8 __U, __m128d __W, __m128d __A, + __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_vfmaddsd3_maskz( + (__v2df)__W, (__v2df)__A, -(__v2df)__B, (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_fmsub_round_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, + const int __R) { + return (__m128)__builtin_ia32_vfmaddss3_maskz( + (__v4sf)__W, (__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fnmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_vfmaddsd3_mask( + (__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fnmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) { + return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, -(__v4sf)__A, + (__v4sf)__B, (__mmask8)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask3_fnmadd_round_sd(__m128d __W, __m128d __A, __m128d __B, + __mmask8 __U, const int __R) { + return (__m128d)__builtin_ia32_vfmaddsd3_mask3( + (__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask3_fnmadd_round_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U, + const int __R) { + return 
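+/* Sketch: the _round variants replace _MM_FROUND_CUR_DIRECTION with an
+   explicit override, which must be a compile-time constant, e.g.
+
+     r = _mm_mask_fmadd_round_sd(w, k, a, b,
+                                 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+
+   The constant requirement is why a macro fallback follows for builds
+   without __OPTIMIZE__. */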
(__m128)__builtin_ia32_vfmaddss3_mask3( + (__v4sf)__W, -(__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_fnmadd_round_sd(__mmask8 __U, __m128d __W, __m128d __A, + __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_vfmaddsd3_maskz( + (__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_fnmadd_round_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, + const int __R) { + return (__m128)__builtin_ia32_vfmaddss3_maskz( + (__v4sf)__W, -(__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fnmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_vfmaddsd3_mask( + (__v2df)__W, -(__v2df)__A, -(__v2df)__B, (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fnmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) { + return (__m128)__builtin_ia32_vfmaddss3_mask( + (__v4sf)__W, -(__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask3_fnmsub_round_sd(__m128d __W, __m128d __A, __m128d __B, + __mmask8 __U, const int __R) { + return (__m128d)__builtin_ia32_vfmsubsd3_mask3( + (__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask3_fnmsub_round_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U, + const int __R) { + return (__m128)__builtin_ia32_vfmsubss3_mask3( + (__v4sf)__W, -(__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_fnmsub_round_sd(__mmask8 __U, __m128d __W, __m128d __A, + __m128d __B, const int __R) { + return (__m128d)__builtin_ia32_vfmaddsd3_maskz( + (__v2df)__W, -(__v2df)__A, -(__v2df)__B, (__mmask8)__U, __R); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_fnmsub_round_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, + const int __R) { + return (__m128)__builtin_ia32_vfmaddss3_maskz( + (__v4sf)__W, -(__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, __R); +} +#else +#define _mm_mask_fmadd_round_sd(A, U, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask(A, B, C, U, R) + +#define _mm_mask_fmadd_round_ss(A, U, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask(A, B, C, U, R) + +#define _mm_mask3_fmadd_round_sd(A, B, C, U, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask3(A, B, C, U, R) + +#define _mm_mask3_fmadd_round_ss(A, B, C, U, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask3(A, B, C, U, R) + +#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_maskz(A, B, C, U, R) + +#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_maskz(A, B, C, U, R) + +#define _mm_mask_fmsub_round_sd(A, U, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask(A, B, -(C), U, R) + +#define _mm_mask_fmsub_round_ss(A, U, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask(A, B, -(C), U, R) + +#define _mm_mask3_fmsub_round_sd(A, B, C, U, R) \ + (__m128d) 
__builtin_ia32_vfmsubsd3_mask3(A, B, C, U, R) + +#define _mm_mask3_fmsub_round_ss(A, B, C, U, R) \ + (__m128) __builtin_ia32_vfmsubss3_mask3(A, B, C, U, R) + +#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_maskz(A, B, -(C), U, R) + +#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_maskz(A, B, -(C), U, R) + +#define _mm_mask_fnmadd_round_sd(A, U, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask(A, -(B), C, U, R) + +#define _mm_mask_fnmadd_round_ss(A, U, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask(A, -(B), C, U, R) + +#define _mm_mask3_fnmadd_round_sd(A, B, C, U, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask3(A, -(B), C, U, R) + +#define _mm_mask3_fnmadd_round_ss(A, B, C, U, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask3(A, -(B), C, U, R) + +#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_maskz(A, -(B), C, U, R) + +#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_maskz(A, -(B), C, U, R) + +#define _mm_mask_fnmsub_round_sd(A, U, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask(A, -(B), -(C), U, R) + +#define _mm_mask_fnmsub_round_ss(A, U, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask(A, -(B), -(C), U, R) + +#define _mm_mask3_fnmsub_round_sd(A, B, C, U, R) \ + (__m128d) __builtin_ia32_vfmsubsd3_mask3(A, -(B), C, U, R) + +#define _mm_mask3_fnmsub_round_ss(A, B, C, U, R) \ + (__m128) __builtin_ia32_vfmsubss3_mask3(A, -(B), C, U, R) + +#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_maskz(A, -(B), -(C), U, R) + +#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_maskz(A, -(B), -(C), U, R) +#endif + +#ifdef __OPTIMIZE__ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comi_round_ss(__m128 __A, __m128 __B, const int __P, const int __R) { + return __builtin_ia32_vcomiss((__v4sf)__A, (__v4sf)__B, __P, __R); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comi_round_sd(__m128d __A, __m128d __B, const int __P, const int __R) { + return __builtin_ia32_vcomisd((__v2df)__A, (__v2df)__B, __P, __R); +} +#else +#define _mm_comi_round_ss(A, B, C, D) __builtin_ia32_vcomiss(A, B, C, D) +#define _mm_comi_round_sd(A, B, C, D) __builtin_ia32_vcomisd(A, B, C, D) +#endif + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sqrt_pd(__m512d __A) { + return (__m512d)__builtin_ia32_sqrtpd512_mask( + (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_sqrt_pd(__m512d __W, __mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_sqrtpd512_mask( + (__v8df)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sqrt_pd(__mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_sqrtpd512_mask( + (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sqrt_ps(__m512 __A) { + return (__m512)__builtin_ia32_sqrtps512_mask( + (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern 
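+/* Sketch: _mm_comi_round_ss/_sd are scalar compares returning 0 or 1,
+   with the predicate and exception control as immediates, e.g.
+
+     if (_mm_comi_round_ss(a, b, _CMP_LT_OS, _MM_FROUND_NO_EXC)) { ... }
+
+   using a predicate from the AVX _CMP_* set. */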
__inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_sqrtps512_mask( + (__v16sf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sqrt_ps(__mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_sqrtps512_mask( + (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_add_pd(__m512d __A, __m512d __B) { + return (__m512d)((__v8df)__A + (__v8df)__B); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_addpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_addpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_add_ps(__m512 __A, __m512 __B) { + return (__m512)((__v16sf)__A + (__v16sf)__B); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_addps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__W, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_addps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_add_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_addsd_mask_round((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_addsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_addss_mask_round((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_addss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sub_pd(__m512d __A, __m512d __B) { + return 
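+/* Sketch of the packed masking convention: mask bit i selects lane i,
+   so with hypothetical vectors src, a, b:
+
+     __m512d r = _mm512_mask_add_pd(src, 0x0F, a, b);
+     // r[i] = i < 4 ? a[i] + b[i] : src[i]
+     __m512d z = _mm512_maskz_add_pd(0x0F, a, b);
+     // z[i] = i < 4 ? a[i] + b[i] : 0.0
+*/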
(__m512d)((__v8df)__A - (__v8df)__B); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_subpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_subpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_sub_ps(__m512 __A, __m512 __B) { + return (__m512)((__v16sf)__A - (__v16sf)__B); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_subps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__W, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_subps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_subsd_mask_round((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_subsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_subss_mask_round((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_subss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mul_pd(__m512d __A, __m512d __B) { + return (__m512d)((__v8df)__A * (__v8df)__B); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_mulpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_mulpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + 
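+/* Note: the unmasked 512-bit arithmetic is expressed with GNU vector
+   extensions (plain +, -, *, /) instead of builtins, so the compiler
+   can constant-fold and optimize it like ordinary arithmetic; only the
+   masked forms need the *_mask builtins to express the merge. */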
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mul_ps(__m512 __A, __m512 __B) { + return (__m512)((__v16sf)__A * (__v16sf)__B); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_mulps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__W, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_mulps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_mulsd_mask_round((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_mulsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_mulss_mask_round((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_mulss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_div_pd(__m512d __M, __m512d __V) { + return (__m512d)((__v8df)__M / (__v8df)__V); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __M, __m512d __V) { + return (__m512d)__builtin_ia32_divpd512_mask((__v8df)__M, (__v8df)__V, + (__v8df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_div_pd(__mmask8 __U, __m512d __M, __m512d __V) { + return (__m512d)__builtin_ia32_divpd512_mask( + (__v8df)__M, (__v8df)__V, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_div_ps(__m512 __A, __m512 __B) { + return (__m512)((__v16sf)__A / (__v16sf)__B); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_divps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__W, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_divps512_mask( + (__v16sf)__A, (__v16sf)__B, 
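+/* Note: masking also suppresses FP exceptions from masked-off lanes, so
+   a masked divide can skip zero divisors (a, b hypothetical; assumes
+   _mm512_cmp_pd_mask from this same header):
+
+     __mmask8 k = _mm512_cmp_pd_mask(b, _mm512_setzero_pd(), _CMP_NEQ_OQ);
+     __m512d  q = _mm512_maskz_div_pd(k, a, b);  // 0.0 where b == 0.0
+*/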
(__v16sf)_mm512_setzero_ps(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_divsd_mask_round((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_divsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_divss_mask_round((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_divss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_max_pd(__m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_maxpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_max_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_max_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_maxpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_max_ps(__m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_maxps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_max_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_maxps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__W, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_max_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_maxps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_max_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_maxsd_mask_round((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
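+/* Caveat (x86 MAXPD/MAXPS semantics): when either input is NaN, or both
+   are zeros of opposite sign, the second source is returned, so max is
+   not commutative:
+
+     _mm512_max_pd(nan, x);  // -> x
+     _mm512_max_pd(x, nan);  // -> nan
+
+   The min family below mirrors this. */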
_mm_maskz_max_sd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_maxsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_max_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_maxss_mask_round((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_max_ss(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_maxss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_min_pd(__m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_minpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_min_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_min_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_minpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_min_ps(__m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_minps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_min_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_minps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__W, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_min_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_minps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_min_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_minsd_mask_round((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_min_sd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_minsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_min_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_minss_mask_round((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U, + 
_MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_min_ss(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_minss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_scalef_pd(__m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_scalefpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_scalef_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_scalef_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_scalefpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_scalef_ps(__m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_scalefps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_scalef_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_scalefps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__W, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_scalef_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_scalefps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_scalef_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_scalefsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_scalef_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_scalefss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 
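+/* Sketch: scalef adjusts the exponent, computing x * 2^floor(y) per
+   element, e.g.
+
+     __m512d y = _mm512_scalef_pd(x, _mm512_set1_pd(3.0));  // x * 8.0
+*/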
__U) { + return (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { + return (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfmsubpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfmsubpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { + return (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfmsubpd512_maskz((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfmsubps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfmsubps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + 
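+/* Sketch: the 512-bit FMA family rounds once, so it is not equivalent
+   to a separate multiply then add:
+
+     __m512d r = _mm512_fmadd_pd(a, b, c);  // a*b + c, single rounding
+     __m512d s = _mm512_fmsub_pd(a, b, c);  // a*b - c
+*/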
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { + return (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfmsubps512_maskz((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { + return (__m512d)__builtin_ia32_vfmaddsubpd512_mask3( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfmaddsubpd512_maskz( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfmaddsubps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { + return (__m512)__builtin_ia32_vfmaddsubps512_mask3( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfmaddsubps512_maskz( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)__A, (__v8df)__B, + -(__v8df)__C, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 
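+/* Sketch: fmaddsub alternates per lane (even lanes subtract, odd lanes
+   add), the shape needed for interleaved complex products:
+
+     __m512d r = _mm512_fmaddsub_pd(a, b, c);
+     // r[2i] = a[2i]*b[2i] - c[2i];  r[2i+1] = a[2i+1]*b[2i+1] + c[2i+1]
+
+   fmsubadd below is the mirror image. */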
__U, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfmaddsubpd512_mask( + (__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { + return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3( + (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfmaddsubpd512_maskz( + (__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfmaddsubps512_mask( + (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfmaddsubps512_mask( + (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { + return (__m512)__builtin_ia32_vfmsubaddps512_mask3( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfmaddsubps512_maskz( + (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfnmaddpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfnmaddpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { + return (__m512d)__builtin_ia32_vfnmaddpd512_mask3((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfnmaddpd512_maskz((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfnmaddps512_mask((__v16sf)__A, (__v16sf)__B, + 
(__v16sf)__C, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfnmaddps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { + return (__m512)__builtin_ia32_vfnmaddps512_mask3((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfnmaddps512_maskz((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfnmsubpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfnmsubpd512_mask((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { + return (__m512d)__builtin_ia32_vfnmsubpd512_mask3((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { + return (__m512d)__builtin_ia32_vfnmsubpd512_maskz((__v8df)__A, (__v8df)__B, + (__v8df)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfnmsubps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfnmsubps512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { + return (__m512)__builtin_ia32_vfnmsubps512_mask3((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { + return (__m512)__builtin_ia32_vfnmsubps512_maskz((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm512_cvttpd_epi32(__m512d __A) { + return (__m256i)__builtin_ia32_cvttpd2dq512_mask( + (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvttpd_epi32(__m256i __W, __mmask8 __U, __m512d __A) { + return (__m256i)__builtin_ia32_cvttpd2dq512_mask( + (__v8df)__A, (__v8si)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvttpd_epi32(__mmask8 __U, __m512d __A) { + return (__m256i)__builtin_ia32_cvttpd2dq512_mask( + (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvttpd_epu32(__m512d __A) { + return (__m256i)__builtin_ia32_cvttpd2udq512_mask( + (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvttpd_epu32(__m256i __W, __mmask8 __U, __m512d __A) { + return (__m256i)__builtin_ia32_cvttpd2udq512_mask( + (__v8df)__A, (__v8si)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvttpd_epu32(__mmask8 __U, __m512d __A) { + return (__m256i)__builtin_ia32_cvttpd2udq512_mask( + (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtpd_epi32(__m512d __A) { + return (__m256i)__builtin_ia32_cvtpd2dq512_mask( + (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtpd_epi32(__m256i __W, __mmask8 __U, __m512d __A) { + return (__m256i)__builtin_ia32_cvtpd2dq512_mask( + (__v8df)__A, (__v8si)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtpd_epi32(__mmask8 __U, __m512d __A) { + return (__m256i)__builtin_ia32_cvtpd2dq512_mask( + (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtpd_epu32(__m512d __A) { + return (__m256i)__builtin_ia32_cvtpd2udq512_mask( + (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtpd_epu32(__m256i __W, __mmask8 __U, __m512d __A) { + return (__m256i)__builtin_ia32_cvtpd2udq512_mask( + (__v8df)__A, (__v8si)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtpd_epu32(__mmask8 __U, __m512d __A) { + return (__m256i)__builtin_ia32_cvtpd2udq512_mask( + (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvttps_epi32(__m512 __A) { + return 
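+/* Sketch: cvtt* truncates toward zero, while cvt* honors the current
+   MXCSR rounding mode (round-to-nearest-even by default):
+
+     _mm512_cvttpd_epi32(_mm512_set1_pd(2.7));  // 2 in each lane
+     _mm512_cvtpd_epi32(_mm512_set1_pd(2.7));   // 3 by default
+*/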
(__m512i)__builtin_ia32_cvttps2dq512_mask( + (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvttps_epi32(__m512i __W, __mmask16 __U, __m512 __A) { + return (__m512i)__builtin_ia32_cvttps2dq512_mask( + (__v16sf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvttps_epi32(__mmask16 __U, __m512 __A) { + return (__m512i)__builtin_ia32_cvttps2dq512_mask( + (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvttps_epu32(__m512 __A) { + return (__m512i)__builtin_ia32_cvttps2udq512_mask( + (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvttps_epu32(__m512i __W, __mmask16 __U, __m512 __A) { + return (__m512i)__builtin_ia32_cvttps2udq512_mask( + (__v16sf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvttps_epu32(__mmask16 __U, __m512 __A) { + return (__m512i)__builtin_ia32_cvttps2udq512_mask( + (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtps_epi32(__m512 __A) { + return (__m512i)__builtin_ia32_cvtps2dq512_mask( + (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtps_epi32(__m512i __W, __mmask16 __U, __m512 __A) { + return (__m512i)__builtin_ia32_cvtps2dq512_mask( + (__v16sf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtps_epi32(__mmask16 __U, __m512 __A) { + return (__m512i)__builtin_ia32_cvtps2dq512_mask( + (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtps_epu32(__m512 __A) { + return (__m512i)__builtin_ia32_cvtps2udq512_mask( + (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtps_epu32(__m512i __W, __mmask16 __U, __m512 __A) { + return (__m512i)__builtin_ia32_cvtps2udq512_mask( + (__v16sf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtps_epu32(__mmask16 __U, __m512 __A) { + return (__m512i)__builtin_ia32_cvtps2udq512_mask( + (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline double + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtsd_f64(__m512d __A) { + return __A[0]; +} + +extern __inline float + 
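+/* Note: the epu32 forms are new with AVX-512F; SSE/AVX had no direct
+   float <-> unsigned conversions, so e.g.
+
+     __m512i u = _mm512_cvtps_epu32(x);
+
+   replaces the old signed-range workarounds. */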
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtss_f32(__m512 __A) { + return __A[0]; +} + +#ifdef __x86_64__ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtu64_ss(__m128 __A, unsigned long long __B) { + return (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)__A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtu64_sd(__m128d __A, unsigned long long __B) { + return (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)__A, __B, + _MM_FROUND_CUR_DIRECTION); +} +#endif + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtu32_ss(__m128 __A, unsigned __B) { + return (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)__A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepi32_ps(__m512i __A) { + return (__m512)__builtin_ia32_cvtdq2ps512_mask( + (__v16si)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepi32_ps(__m512 __W, __mmask16 __U, __m512i __A) { + return (__m512)__builtin_ia32_cvtdq2ps512_mask( + (__v16si)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepi32_ps(__mmask16 __U, __m512i __A) { + return (__m512)__builtin_ia32_cvtdq2ps512_mask( + (__v16si)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepu32_ps(__m512i __A) { + return (__m512)__builtin_ia32_cvtudq2ps512_mask( + (__v16si)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtepu32_ps(__m512 __W, __mmask16 __U, __m512i __A) { + return (__m512)__builtin_ia32_cvtudq2ps512_mask( + (__v16si)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtepu32_ps(__mmask16 __U, __m512i __A) { + return (__m512)__builtin_ia32_cvtudq2ps512_mask( + (__v16si)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fixupimm_pd(__m512d __A, __m512d __B, __m512i __C, const int __imm) { + return (__m512d)__builtin_ia32_fixupimmpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fixupimm_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512i __C, + const int __imm) { + return (__m512d)__builtin_ia32_fixupimmpd512_mask( + (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fixupimm_pd(__mmask8 __U, __m512d __A, __m512d __B, + __m512i __C, const int __imm) { + return (__m512d)__builtin_ia32_fixupimmpd512_maskz( + (__v8df)__A, 
(__v8df)__B, (__v8di)__C, __imm, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fixupimm_ps(__m512 __A, __m512 __B, __m512i __C, const int __imm) { + return (__m512)__builtin_ia32_fixupimmps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_fixupimm_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512i __C, + const int __imm) { + return (__m512)__builtin_ia32_fixupimmps512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_fixupimm_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512i __C, + const int __imm) { + return (__m512)__builtin_ia32_fixupimmps512_maskz( + (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fixupimm_sd(__m128d __A, __m128d __B, __m128i __C, const int __imm) { + return (__m128d)__builtin_ia32_fixupimmsd_mask( + (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fixupimm_sd(__m128d __A, __mmask8 __U, __m128d __B, __m128i __C, + const int __imm) { + return (__m128d)__builtin_ia32_fixupimmsd_mask( + (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_fixupimm_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128i __C, + const int __imm) { + return (__m128d)__builtin_ia32_fixupimmsd_maskz( + (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fixupimm_ss(__m128 __A, __m128 __B, __m128i __C, const int __imm) { + return (__m128)__builtin_ia32_fixupimmss_mask( + (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fixupimm_ss(__m128 __A, __mmask8 __U, __m128 __B, __m128i __C, + const int __imm) { + return (__m128)__builtin_ia32_fixupimmss_mask( + (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_fixupimm_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128i __C, + const int __imm) { + return (__m128)__builtin_ia32_fixupimmss_maskz( + (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} +#else +#define _mm512_fixupimm_pd(X, Y, Z, C) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask( \ + (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), \ + (int)(C), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_fixupimm_pd(X, U, Y, Z, C) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask( \ + (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), \ + (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define 
_mm512_maskz_fixupimm_pd(U, X, Y, Z, C) \ + ((__m512d)__builtin_ia32_fixupimmpd512_maskz( \ + (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), \ + (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_fixupimm_ps(X, Y, Z, C) \ + ((__m512)__builtin_ia32_fixupimmps512_mask( \ + (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), \ + (int)(C), (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_fixupimm_ps(X, U, Y, Z, C) \ + ((__m512)__builtin_ia32_fixupimmps512_mask( \ + (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), \ + (int)(C), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_fixupimm_ps(U, X, Y, Z, C) \ + ((__m512)__builtin_ia32_fixupimmps512_maskz( \ + (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), \ + (int)(C), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_fixupimm_sd(X, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask( \ + (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), \ + (int)(C), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_fixupimm_sd(X, U, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask( \ + (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), \ + (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_fixupimm_sd(U, X, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmsd_maskz( \ + (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), \ + (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_fixupimm_ss(X, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmss_mask( \ + (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), \ + (int)(C), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_fixupimm_ss(X, U, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmss_mask( \ + (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), \ + (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_fixupimm_ss(U, X, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmss_maskz( \ + (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), \ + (int)(C), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#endif + +#ifdef __x86_64__ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_u64(__m128 __A) { + return (unsigned long long)__builtin_ia32_vcvtss2usi64( + (__v4sf)__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttss_u64(__m128 __A) { + return (unsigned long long)__builtin_ia32_vcvttss2usi64( + (__v4sf)__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttss_i64(__m128 __A) { + return (long long)__builtin_ia32_vcvttss2si64((__v4sf)__A, + _MM_FROUND_CUR_DIRECTION); +} +#endif /* __x86_64__ */ + +extern __inline unsigned + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_u32(__m128 __A) { + return (unsigned)__builtin_ia32_vcvtss2usi32((__v4sf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttss_u32(__m128 __A) { + return (unsigned)__builtin_ia32_vcvttss2usi32((__v4sf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttss_i32(__m128 __A) { + return 
(int)__builtin_ia32_vcvttss2si32((__v4sf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsd_u64(__m128d __A) { + return (unsigned long long)__builtin_ia32_vcvtsd2usi64( + (__v2df)__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttsd_u64(__m128d __A) { + return (unsigned long long)__builtin_ia32_vcvttsd2usi64( + (__v2df)__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttsd_i64(__m128d __A) { + return (long long)__builtin_ia32_vcvttsd2si64((__v2df)__A, + _MM_FROUND_CUR_DIRECTION); +} +#endif /* __x86_64__ */ + +extern __inline unsigned + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsd_u32(__m128d __A) { + return (unsigned)__builtin_ia32_vcvtsd2usi32((__v2df)__A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttsd_u32(__m128d __A) { + return (unsigned)__builtin_ia32_vcvttsd2usi32((__v2df)__A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttsd_i32(__m128d __A) { + return (int)__builtin_ia32_vcvttsd2si32((__v2df)__A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtps_pd(__m256 __A) { + return (__m512d)__builtin_ia32_cvtps2pd512_mask( + (__v8sf)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtps_pd(__m512d __W, __mmask8 __U, __m256 __A) { + return (__m512d)__builtin_ia32_cvtps2pd512_mask( + (__v8sf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtps_pd(__mmask8 __U, __m256 __A) { + return (__m512d)__builtin_ia32_cvtps2pd512_mask( + (__v8sf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtph_ps(__m256i __A) { + return (__m512)__builtin_ia32_vcvtph2ps512_mask( + (__v16hi)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cvtph_ps(__m512 __W, __mmask16 __U, __m256i __A) { + return (__m512)__builtin_ia32_vcvtph2ps512_mask( + (__v16hi)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtph_ps(__mmask16 __U, __m256i __A) { + return (__m512)__builtin_ia32_vcvtph2ps512_mask( + (__v16hi)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtpd_ps(__m512d __A) { + return (__m256)__builtin_ia32_cvtpd2ps512_mask( + (__v8df)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm512_mask_cvtpd_ps(__m256 __W, __mmask8 __U, __m512d __A) { + return (__m256)__builtin_ia32_cvtpd2ps512_mask( + (__v8df)__A, (__v8sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_cvtpd_ps(__mmask8 __U, __m512d __A) { + return (__m256)__builtin_ia32_cvtpd2ps512_mask( + (__v8df)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_getexp_ps(__m512 __A) { + return (__m512)__builtin_ia32_getexpps512_mask( + (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_getexp_ps(__m512 __W, __mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_getexpps512_mask( + (__v16sf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_getexp_ps(__mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_getexpps512_mask( + (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_getexp_pd(__m512d __A) { + return (__m512d)__builtin_ia32_getexppd512_mask( + (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_getexp_pd(__m512d __W, __mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_getexppd512_mask( + (__v8df)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_getexp_pd(__mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_getexppd512_mask( + (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_getexp_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_getexpss128_round((__v4sf)__A, (__v4sf)__B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_getexp_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_getexpss_mask_round((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_getexp_ss(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_getexpss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_getexp_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_getexpsd128_round((__v2df)__A, (__v2df)__B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_getexp_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_getexpsd_mask_round((__v2df)__A, 
(__v2df)__B, + (__v2df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_getexp_sd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_getexpsd_mask_round( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_getmant_pd(__m512d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { + return (__m512d)__builtin_ia32_getmantpd512_mask( + (__v8df)__A, (__C << 2) | __B, _mm512_undefined_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_getmant_pd(__m512d __W, __mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { + return (__m512d)__builtin_ia32_getmantpd512_mask( + (__v8df)__A, (__C << 2) | __B, (__v8df)__W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_getmant_pd(__mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { + return (__m512d)__builtin_ia32_getmantpd512_mask( + (__v8df)__A, (__C << 2) | __B, (__v8df)_mm512_setzero_pd(), __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_getmant_ps(__m512 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { + return (__m512)__builtin_ia32_getmantps512_mask( + (__v16sf)__A, (__C << 2) | __B, _mm512_undefined_ps(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_getmant_ps(__m512 __W, __mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { + return (__m512)__builtin_ia32_getmantps512_mask( + (__v16sf)__A, (__C << 2) | __B, (__v16sf)__W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_getmant_ps(__mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { + return (__m512)__builtin_ia32_getmantps512_mask( + (__v16sf)__A, (__C << 2) | __B, (__v16sf)_mm512_setzero_ps(), __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_getmant_sd(__m128d __A, __m128d __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) { + return (__m128d)__builtin_ia32_getmantsd_round( + (__v2df)__A, (__v2df)__B, (__D << 2) | __C, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_getmant_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) { + return (__m128d)__builtin_ia32_getmantsd_mask_round( + (__v2df)__A, (__v2df)__B, (__D << 2) | __C, (__v2df)__W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_getmant_sd(__mmask8 __U, __m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) { + return (__m128d)__builtin_ia32_getmantsd_mask_round( + (__v2df)__A, (__v2df)__B, (__D << 2) | __C, 
(__v2df)_mm_setzero_pd(), __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_getmant_ss(__m128 __A, __m128 __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) { + return (__m128)__builtin_ia32_getmantss_round( + (__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_getmant_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) { + return (__m128)__builtin_ia32_getmantss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, (__v4sf)__W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_getmant_ss(__mmask8 __U, __m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) { + return (__m128)__builtin_ia32_getmantss_mask_round( + (__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, (__v4sf)_mm_setzero_ps(), __U, + _MM_FROUND_CUR_DIRECTION); +} + +#else +#define _mm512_getmant_pd(X, B, C) \ + ((__m512d)__builtin_ia32_getmantpd512_mask( \ + (__v8df)(__m512d)(X), (int)(((C) << 2) | (B)), \ + (__v8df)_mm512_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getmant_pd(W, U, X, B, C) \ + ((__m512d)__builtin_ia32_getmantpd512_mask( \ + (__v8df)(__m512d)(X), (int)(((C) << 2) | (B)), (__v8df)(__m512d)(W), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getmant_pd(U, X, B, C) \ + ((__m512d)__builtin_ia32_getmantpd512_mask( \ + (__v8df)(__m512d)(X), (int)(((C) << 2) | (B)), \ + (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_getmant_ps(X, B, C) \ + ((__m512)__builtin_ia32_getmantps512_mask( \ + (__v16sf)(__m512)(X), (int)(((C) << 2) | (B)), \ + (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getmant_ps(W, U, X, B, C) \ + ((__m512)__builtin_ia32_getmantps512_mask( \ + (__v16sf)(__m512)(X), (int)(((C) << 2) | (B)), (__v16sf)(__m512)(W), \ + (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getmant_ps(U, X, B, C) \ + ((__m512)__builtin_ia32_getmantps512_mask( \ + (__v16sf)(__m512)(X), (int)(((C) << 2) | (B)), \ + (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) +#define _mm_getmant_sd(X, Y, C, D) \ + ((__m128d)__builtin_ia32_getmantsd_round( \ + (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D) << 2) | (C)), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_sd(W, U, X, Y, C, D) \ + ((__m128d)__builtin_ia32_getmantsd_mask_round( \ + (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D) << 2) | (C)), \ + (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_getmant_sd(U, X, Y, C, D) \ + ((__m128d)__builtin_ia32_getmantsd_mask_round( \ + (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(((D) << 2) | (C)), \ + (__v2df)_mm_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_getmant_ss(X, Y, C, D) \ + ((__m128)__builtin_ia32_getmantss_round( \ + (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D) << 2) | (C)), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_ss(W, U, X, Y, C, D) \ + ((__m128)__builtin_ia32_getmantss_mask_round( \ + (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D) << 2) | (C)), \ + (__v4sf)(__m128)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + 
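+
+/* The getmant/getexp pair factors each lane as x == mant * 2^exp. A
+   minimal usage sketch (hypothetical caller code, not part of this
+   header; assumes AVX-512F and inclusion via the immintrin umbrella):
+
+     __m512 x = _mm512_set1_ps(24.0f);
+     __m512 m = _mm512_getmant_ps(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
+     __m512 e = _mm512_getexp_ps(x);
+     float mant = _mm512_cvtss_f32(m);  // 1.5f, normalized into [1, 2)
+     float exp2 = _mm512_cvtss_f32(e);  // 4.0f, since 24 == 1.5 * 2^4
+*/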
+#define _mm_maskz_getmant_ss(U, X, Y, C, D) \ + ((__m128)__builtin_ia32_getmantss_mask_round( \ + (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(((D) << 2) | (C)), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_getexp_ss(A, B) \ + ((__m128)__builtin_ia32_getexpss128_round( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getexp_ss(W, U, A, B) \ + (__m128) \ + __builtin_ia32_getexpss_mask_round(A, B, W, U, _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_getexp_ss(U, A, B) \ + (__m128) __builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), \ + U, _MM_FROUND_CUR_DIRECTION) + +#define _mm_getexp_sd(A, B) \ + ((__m128d)__builtin_ia32_getexpsd128_round( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getexp_sd(W, U, A, B) \ + (__m128d) \ + __builtin_ia32_getexpsd_mask_round(A, B, W, U, _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_getexp_sd(U, A, B) \ + (__m128d) __builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), \ + U, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_getexp_ps(A) \ + ((__m512)__builtin_ia32_getexpps512_mask( \ + (__v16sf)(__m512)(A), (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getexp_ps(W, U, A) \ + ((__m512)__builtin_ia32_getexpps512_mask( \ + (__v16sf)(__m512)(A), (__v16sf)(__m512)(W), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getexp_ps(U, A) \ + ((__m512)__builtin_ia32_getexpps512_mask( \ + (__v16sf)(__m512)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_getexp_pd(A) \ + ((__m512d)__builtin_ia32_getexppd512_mask( \ + (__v8df)(__m512d)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getexp_pd(W, U, A) \ + ((__m512d)__builtin_ia32_getexppd512_mask( \ + (__v8df)(__m512d)(A), (__v8df)(__m512d)(W), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getexp_pd(U, A) \ + ((__m512d)__builtin_ia32_getexppd512_mask( \ + (__v8df)(__m512d)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_roundscale_ps(__m512 __A, const int __imm) { + return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, __imm, + (__v16sf)_mm512_undefined_ps(), + -1, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_roundscale_ps(__m512 __A, __mmask16 __B, __m512 __C, + const int __imm) { + return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__C, __imm, + (__v16sf)__A, (__mmask16)__B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_roundscale_ps(__mmask16 __A, __m512 __B, const int __imm) { + return (__m512)__builtin_ia32_rndscaleps_mask( + (__v16sf)__B, __imm, (__v16sf)_mm512_setzero_ps(), (__mmask16)__A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_roundscale_pd(__m512d __A, const int __imm) { + return (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)__A, __imm, + (__v8df)_mm512_undefined_pd(), + -1, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm512_mask_roundscale_pd(__m512d __A, __mmask8 __B, __m512d __C, + const int __imm) { + return (__m512d)__builtin_ia32_rndscalepd_mask( + (__v8df)__C, __imm, (__v8df)__A, (__mmask8)__B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_roundscale_pd(__mmask8 __A, __m512d __B, const int __imm) { + return (__m512d)__builtin_ia32_rndscalepd_mask( + (__v8df)__B, __imm, (__v8df)_mm512_setzero_pd(), (__mmask8)__A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_roundscale_ss(__m128 __A, __m128 __B, const int __imm) { + return (__m128)__builtin_ia32_rndscaless_round( + (__v4sf)__A, (__v4sf)__B, __imm, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_roundscale_sd(__m128d __A, __m128d __B, const int __imm) { + return (__m128d)__builtin_ia32_rndscalesd_round( + (__v2df)__A, (__v2df)__B, __imm, _MM_FROUND_CUR_DIRECTION); +} + +#else +#define _mm512_roundscale_ps(A, B) \ + ((__m512)__builtin_ia32_rndscaleps_mask( \ + (__v16sf)(__m512)(A), (int)(B), (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_roundscale_ps(A, B, C, D) \ + ((__m512)__builtin_ia32_rndscaleps_mask( \ + (__v16sf)(__m512)(C), (int)(D), (__v16sf)(__m512)(A), (__mmask16)(B), \ + _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_roundscale_ps(A, B, C) \ + ((__m512)__builtin_ia32_rndscaleps_mask( \ + (__v16sf)(__m512)(B), (int)(C), (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(A), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_roundscale_pd(A, B) \ + ((__m512d)__builtin_ia32_rndscalepd_mask( \ + (__v8df)(__m512d)(A), (int)(B), (__v8df)_mm512_undefined_pd(), \ + (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_roundscale_pd(A, B, C, D) \ + ((__m512d)__builtin_ia32_rndscalepd_mask( \ + (__v8df)(__m512d)(C), (int)(D), (__v8df)(__m512d)(A), (__mmask8)(B), \ + _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_roundscale_pd(A, B, C) \ + ((__m512d)__builtin_ia32_rndscalepd_mask( \ + (__v8df)(__m512d)(B), (int)(C), (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(A), _MM_FROUND_CUR_DIRECTION)) +#define _mm_roundscale_ss(A, B, C) \ + ((__m128)__builtin_ia32_rndscaless_round((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + _MM_FROUND_CUR_DIRECTION)) +#define _mm_roundscale_sd(A, B, C) \ + ((__m128d)__builtin_ia32_rndscalesd_round((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + _MM_FROUND_CUR_DIRECTION)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmp_pd_mask(__m512d __X, __m512d __Y, const int __P) { + return (__mmask8)__builtin_ia32_cmppd512_mask( + (__v8df)__X, (__v8df)__Y, __P, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmp_ps_mask(__m512 __X, __m512 __Y, const int __P) { + return (__mmask16)__builtin_ia32_cmpps512_mask( + (__v16sf)__X, (__v16sf)__Y, __P, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_cmp_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y, const int __P) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + __P, (__mmask16)__U, + 
_MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_cmp_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y, const int __P) { + return (__mmask8)__builtin_ia32_cmppd512_mask( + (__v8df)__X, (__v8df)__Y, __P, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpeq_pd_mask(__m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_EQ_OQ, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpeq_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_EQ_OQ, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmplt_pd_mask(__m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_LT_OS, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmplt_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_LT_OS, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmple_pd_mask(__m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_LE_OS, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmple_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_LE_OS, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpunord_pd_mask(__m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_UNORD_Q, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpunord_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_UNORD_Q, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpneq_pd_mask(__m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_NEQ_UQ, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpneq_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_NEQ_UQ, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpnlt_pd_mask(__m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_NLT_US, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} 
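+
+/* These compares return a lane-per-bit __mmask rather than a vector, so
+   the result feeds straight into the mask/maskz operations. A minimal
+   sketch (hypothetical helper, not part of this header; assumes AVX-512F):
+
+     // Clamp negative lanes of v to 0.0 without a branch.
+     __m512d clamp_nonnegative(__m512d v) {
+       __mmask8 neg = _mm512_cmplt_pd_mask(v, _mm512_setzero_pd());
+       return _mm512_mask_mov_pd(v, neg, _mm512_setzero_pd());
+     }
+
+   _mm512_mask_mov_pd(src, k, a) selects a's lane where the corresponding
+   bit of k is set and src's lane where it is clear. */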
+ +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpnlt_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_NLT_US, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpnle_pd_mask(__m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_NLE_US, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpnle_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_NLE_US, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpord_pd_mask(__m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_ORD_Q, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpord_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) { + return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, + _CMP_ORD_Q, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpeq_ps_mask(__m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_EQ_OQ, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpeq_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_EQ_OQ, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmplt_ps_mask(__m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_LT_OS, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmplt_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_LT_OS, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmple_ps_mask(__m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_LE_OS, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmple_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_LE_OS, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpunord_ps_mask(__m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_UNORD_Q, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern 
__inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpunord_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_UNORD_Q, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpneq_ps_mask(__m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_NEQ_UQ, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpneq_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_NEQ_UQ, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpnlt_ps_mask(__m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_NLT_US, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpnlt_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_NLT_US, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpnle_ps_mask(__m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_NLE_US, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpnle_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_NLE_US, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpord_ps_mask(__m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_ORD_Q, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpord_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) { + return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, + _CMP_ORD_Q, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmp_sd_mask(__m128d __X, __m128d __Y, const int __P) { + return (__mmask8)__builtin_ia32_cmpsd_mask( + (__v2df)__X, (__v2df)__Y, __P, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_cmp_sd_mask(__mmask8 __M, __m128d __X, __m128d __Y, const int __P) { + return (__mmask8)__builtin_ia32_cmpsd_mask( + (__v2df)__X, (__v2df)__Y, __P, (__mmask8)__M, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmp_ss_mask(__m128 __X, __m128 __Y, const int __P) { + return (__mmask8)__builtin_ia32_cmpss_mask( + (__v4sf)__X, (__v4sf)__Y, __P, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); +} + +extern 
__inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmp_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y, const int __P) { + return (__mmask8)__builtin_ia32_cmpss_mask( + (__v4sf)__X, (__v4sf)__Y, __P, (__mmask8)__M, _MM_FROUND_CUR_DIRECTION); +} + +#else +#define _mm512_cmp_pd_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_cmppd512_mask( \ + (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_cmp_ps_mask(X, Y, P) \ + ((__mmask16)__builtin_ia32_cmpps512_mask( \ + (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_cmp_pd_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmppd512_mask( \ + (__v8df)(__m512d)(X), (__v8df)(__m512d)(Y), (int)(P), (__mmask8)M, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_cmp_ps_mask(M, X, Y, P) \ + ((__mmask16)__builtin_ia32_cmpps512_mask( \ + (__v16sf)(__m512)(X), (__v16sf)(__m512)(Y), (int)(P), (__mmask16)M, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_cmp_sd_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpsd_mask( \ + (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_sd_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P), M, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_cmp_ss_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpss_mask( \ + (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_ss_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), M, \ + _MM_FROUND_CUR_DIRECTION)) +#endif + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_kmov(__mmask16 __A) { + return __builtin_ia32_kmovw(__A); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castpd_ps(__m512d __A) { + return (__m512)(__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castpd_si512(__m512d __A) { + return (__m512i)(__A); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castps_pd(__m512 __A) { + return (__m512d)(__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castps_si512(__m512 __A) { + return (__m512i)(__A); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castsi512_ps(__m512i __A) { + return (__m512)(__A); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castsi512_pd(__m512i __A) { + return (__m512d)(__A); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castpd512_pd128(__m512d __A) { + return (__m128d)_mm512_extractf32x4_ps((__m512)__A, 0); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castps512_ps128(__m512 __A) { + return _mm512_extractf32x4_ps(__A, 0); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castsi512_si128(__m512i __A) { + return (__m128i)_mm512_extracti32x4_epi32((__m512i)__A, 0); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm512_castpd512_pd256(__m512d __A) { + return _mm512_extractf64x4_pd(__A, 0); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castps512_ps256(__m512 __A) { + return (__m256)_mm512_extractf64x4_pd((__m512d)__A, 0); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castsi512_si256(__m512i __A) { + return (__m256i)_mm512_extractf64x4_pd((__m512d)__A, 0); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castpd128_pd512(__m128d __A) { + return (__m512d)__builtin_ia32_pd512_pd((__m128d)__A); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castps128_ps512(__m128 __A) { + return (__m512)__builtin_ia32_ps512_ps((__m128)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castsi128_si512(__m128i __A) { + return (__m512i)__builtin_ia32_si512_si((__v4si)__A); +} + +extern __inline __m512d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castpd256_pd512(__m256d __A) { + return __builtin_ia32_pd512_256pd(__A); +} + +extern __inline __m512 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castps256_ps512(__m256 __A) { + return __builtin_ia32_ps512_256ps(__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_castsi256_si512(__m256i __A) { + return (__m512i)__builtin_ia32_si512_256si((__v8si)__A); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpeq_epu32_mask(__m512i __A, __m512i __B) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__A, (__v16si)__B, 0, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpeq_epu32_mask(__mmask16 __U, __m512i __A, __m512i __B) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__A, (__v16si)__B, 0, + __U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpeq_epu64_mask(__mmask8 __U, __m512i __A, __m512i __B) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__A, (__v8di)__B, 0, + __U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpeq_epu64_mask(__m512i __A, __m512i __B) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__A, (__v8di)__B, 0, + (__mmask8)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cmpgt_epu32_mask(__m512i __A, __m512i __B) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__A, (__v16si)__B, 6, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpgt_epu32_mask(__mmask16 __U, __m512i __A, __m512i __B) { + return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__A, (__v16si)__B, 6, + __U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_cmpgt_epu64_mask(__mmask8 __U, __m512i __A, __m512i __B) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__A, (__v8di)__B, 6, + __U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm512_cmpgt_epu64_mask(__m512i __A, __m512i __B) { + return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__A, (__v8di)__B, 6, + (__mmask8)-1); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __v8si __T1 = (__v8si)_mm512_extracti64x4_epi64(__A, 1); \ + __v8si __T2 = (__v8si)_mm512_extracti64x4_epi64(__A, 0); \ + __m256i __T3 = (__m256i)(__T1 op __T2); \ + __v4si __T4 = (__v4si)_mm256_extracti128_si256(__T3, 1); \ + __v4si __T5 = (__v4si)_mm256_extracti128_si256(__T3, 0); \ + __v4si __T6 = __T4 op __T5; \ + __v4si __T7 = __builtin_shuffle(__T6, (__v4si){2, 3, 0, 1}); \ + __v4si __T8 = __T6 op __T7; \ + return __T8[0] op __T8[1] + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_add_epi32(__m512i __A) { + __MM512_REDUCE_OP(+); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_mul_epi32(__m512i __A) { + __MM512_REDUCE_OP(*); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_and_epi32(__m512i __A) { + __MM512_REDUCE_OP(&); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_or_epi32(__m512i __A) { + __MM512_REDUCE_OP(|); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_add_epi32(__mmask16 __U, __m512i __A) { + __A = _mm512_maskz_mov_epi32(__U, __A); + __MM512_REDUCE_OP(+); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_mul_epi32(__mmask16 __U, __m512i __A) { + __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __U, __A); + __MM512_REDUCE_OP(*); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_and_epi32(__mmask16 __U, __m512i __A) { + __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0), __U, __A); + __MM512_REDUCE_OP(&); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_or_epi32(__mmask16 __U, __m512i __A) { + __A = _mm512_maskz_mov_epi32(__U, __A); + __MM512_REDUCE_OP(|); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __m256i __T1 = (__m256i)_mm512_extracti64x4_epi64(__A, 1); \ + __m256i __T2 = (__m256i)_mm512_extracti64x4_epi64(__A, 0); \ + __m256i __T3 = _mm256_##op(__T1, __T2); \ + __m128i __T4 = (__m128i)_mm256_extracti128_si256(__T3, 1); \ + __m128i __T5 = (__m128i)_mm256_extracti128_si256(__T3, 0); \ + __m128i __T6 = _mm_##op(__T4, __T5); \ + __m128i __T7 = \ + (__m128i)__builtin_shuffle((__v4si)__T6, (__v4si){2, 3, 0, 1}); \ + __m128i __T8 = _mm_##op(__T6, __T7); \ + __m128i __T9 = \ + (__m128i)__builtin_shuffle((__v4si)__T8, (__v4si){1, 0, 1, 0}); \ + __v4si __T10 = (__v4si)_mm_##op(__T8, __T9); \ + return __T10[0] + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_min_epi32(__m512i __A) { + __MM512_REDUCE_OP(min_epi32); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_max_epi32(__m512i __A) { + __MM512_REDUCE_OP(max_epi32); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_min_epu32(__m512i __A) { + __MM512_REDUCE_OP(min_epu32); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_max_epu32(__m512i 
__A) { + __MM512_REDUCE_OP(max_epu32); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_min_epi32(__mmask16 __U, __m512i __A) { + __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __U, __A); + __MM512_REDUCE_OP(min_epi32); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_max_epi32(__mmask16 __U, __m512i __A) { + __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __U, __A); + __MM512_REDUCE_OP(max_epi32); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_min_epu32(__mmask16 __U, __m512i __A) { + __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0), __U, __A); + __MM512_REDUCE_OP(min_epu32); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_max_epu32(__mmask16 __U, __m512i __A) { + __A = _mm512_maskz_mov_epi32(__U, __A); + __MM512_REDUCE_OP(max_epu32); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __m256 __T1 = (__m256)_mm512_extractf64x4_pd((__m512d)__A, 1); \ + __m256 __T2 = (__m256)_mm512_extractf64x4_pd((__m512d)__A, 0); \ + __m256 __T3 = __T1 op __T2; \ + __m128 __T4 = _mm256_extractf128_ps(__T3, 1); \ + __m128 __T5 = _mm256_extractf128_ps(__T3, 0); \ + __m128 __T6 = __T4 op __T5; \ + __m128 __T7 = __builtin_shuffle(__T6, (__v4si){2, 3, 0, 1}); \ + __m128 __T8 = __T6 op __T7; \ + return __T8[0] op __T8[1] + +extern __inline float + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_add_ps(__m512 __A) { + __MM512_REDUCE_OP(+); +} + +extern __inline float + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_mul_ps(__m512 __A) { + __MM512_REDUCE_OP(*); +} + +extern __inline float + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_add_ps(__mmask16 __U, __m512 __A) { + __A = _mm512_maskz_mov_ps(__U, __A); + __MM512_REDUCE_OP(+); +} + +extern __inline float + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_mul_ps(__mmask16 __U, __m512 __A) { + __A = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __U, __A); + __MM512_REDUCE_OP(*); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __m256 __T1 = (__m256)_mm512_extractf64x4_pd((__m512d)__A, 1); \ + __m256 __T2 = (__m256)_mm512_extractf64x4_pd((__m512d)__A, 0); \ + __m256 __T3 = _mm256_##op(__T1, __T2); \ + __m128 __T4 = _mm256_extractf128_ps(__T3, 1); \ + __m128 __T5 = _mm256_extractf128_ps(__T3, 0); \ + __m128 __T6 = _mm_##op(__T4, __T5); \ + __m128 __T7 = __builtin_shuffle(__T6, (__v4si){2, 3, 0, 1}); \ + __m128 __T8 = _mm_##op(__T6, __T7); \ + __m128 __T9 = __builtin_shuffle(__T8, (__v4si){1, 0, 1, 0}); \ + __m128 __T10 = _mm_##op(__T8, __T9); \ + return __T10[0] + +extern __inline float + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_min_ps(__m512 __A) { + __MM512_REDUCE_OP(min_ps); +} + +extern __inline float + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_max_ps(__m512 __A) { + __MM512_REDUCE_OP(max_ps); +} + +extern __inline float + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_min_ps(__mmask16 __U, __m512 __A) { + __A = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __U, __A); + __MM512_REDUCE_OP(min_ps); +} + +extern __inline float + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_max_ps(__mmask16 __U, __m512 __A) { + __A = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __U, __A); + __MM512_REDUCE_OP(max_ps); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __v4di __T1 = (__v4di)_mm512_extracti64x4_epi64(__A, 1); \ + __v4di __T2 = (__v4di)_mm512_extracti64x4_epi64(__A, 0); \ + __m256i __T3 = (__m256i)(__T1 op __T2); \ + __v2di __T4 = (__v2di)_mm256_extracti128_si256(__T3, 1); \ + __v2di __T5 = (__v2di)_mm256_extracti128_si256(__T3, 0); \ + __v2di __T6 = __T4 op __T5; \ + return __T6[0] op __T6[1] + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_add_epi64(__m512i __A) { + __MM512_REDUCE_OP(+); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_mul_epi64(__m512i __A) { + __MM512_REDUCE_OP(*); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_and_epi64(__m512i __A) { + __MM512_REDUCE_OP(&); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_or_epi64(__m512i __A) { + __MM512_REDUCE_OP(|); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_add_epi64(__mmask8 __U, __m512i __A) { + __A = _mm512_maskz_mov_epi64(__U, __A); + __MM512_REDUCE_OP(+); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_mul_epi64(__mmask8 __U, __m512i __A) { + __A = _mm512_mask_mov_epi64(_mm512_set1_epi64(1LL), __U, __A); + __MM512_REDUCE_OP(*); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_and_epi64(__mmask8 __U, __m512i __A) { + __A = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0LL), __U, __A); + __MM512_REDUCE_OP(&); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_or_epi64(__mmask8 __U, __m512i __A) { + __A = _mm512_maskz_mov_epi64(__U, __A); + __MM512_REDUCE_OP(|); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __m512i __T1 = _mm512_shuffle_i64x2(__A, __A, 0x4e); \ + __m512i __T2 = _mm512_##op(__A, __T1); \ + __m512i __T3 = (__m512i)__builtin_shuffle((__v8di)__T2, \ + (__v8di){2, 3, 0, 1, 6, 7, 4, 5}); \ + __m512i __T4 = _mm512_##op(__T2, __T3); \ + __m512i __T5 = (__m512i)__builtin_shuffle((__v8di)__T4, \ + (__v8di){1, 0, 3, 2, 5, 4, 7, 6}); \ + __v8di __T6 = (__v8di)_mm512_##op(__T4, __T5); \ + return __T6[0] + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_min_epi64(__m512i __A) { + __MM512_REDUCE_OP(min_epi64); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_max_epi64(__m512i __A) { + __MM512_REDUCE_OP(max_epi64); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_min_epi64(__mmask8 __U, __m512i __A) { + __A = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __U, __A); + __MM512_REDUCE_OP(min_epi64); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_max_epi64(__mmask8 __U, __m512i __A) { + __A = 
_mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1), __U, + __A); + __MM512_REDUCE_OP(max_epi64); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_min_epu64(__m512i __A) { + __MM512_REDUCE_OP(min_epu64); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_max_epu64(__m512i __A) { + __MM512_REDUCE_OP(max_epu64); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_min_epu64(__mmask8 __U, __m512i __A) { + __A = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0LL), __U, __A); + __MM512_REDUCE_OP(min_epu64); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_max_epu64(__mmask8 __U, __m512i __A) { + __A = _mm512_maskz_mov_epi64(__U, __A); + __MM512_REDUCE_OP(max_epu64); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __m256d __T1 = (__m256d)_mm512_extractf64x4_pd(__A, 1); \ + __m256d __T2 = (__m256d)_mm512_extractf64x4_pd(__A, 0); \ + __m256d __T3 = __T1 op __T2; \ + __m128d __T4 = _mm256_extractf128_pd(__T3, 1); \ + __m128d __T5 = _mm256_extractf128_pd(__T3, 0); \ + __m128d __T6 = __T4 op __T5; \ + return __T6[0] op __T6[1] + +extern __inline double + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_add_pd(__m512d __A) { + __MM512_REDUCE_OP(+); +} + +extern __inline double + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_mul_pd(__m512d __A) { + __MM512_REDUCE_OP(*); +} + +extern __inline double + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_add_pd(__mmask8 __U, __m512d __A) { + __A = _mm512_maskz_mov_pd(__U, __A); + __MM512_REDUCE_OP(+); +} + +extern __inline double + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_mul_pd(__mmask8 __U, __m512d __A) { + __A = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __U, __A); + __MM512_REDUCE_OP(*); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __m256d __T1 = (__m256d)_mm512_extractf64x4_pd(__A, 1); \ + __m256d __T2 = (__m256d)_mm512_extractf64x4_pd(__A, 0); \ + __m256d __T3 = _mm256_##op(__T1, __T2); \ + __m128d __T4 = _mm256_extractf128_pd(__T3, 1); \ + __m128d __T5 = _mm256_extractf128_pd(__T3, 0); \ + __m128d __T6 = _mm_##op(__T4, __T5); \ + __m128d __T7 = (__m128d)__builtin_shuffle(__T6, (__v2di){1, 0}); \ + __m128d __T8 = _mm_##op(__T6, __T7); \ + return __T8[0] + +extern __inline double + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_min_pd(__m512d __A) { + __MM512_REDUCE_OP(min_pd); +} + +extern __inline double + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_reduce_max_pd(__m512d __A) { + __MM512_REDUCE_OP(max_pd); +} + +extern __inline double + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_min_pd(__mmask8 __U, __m512d __A) { + __A = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __U, __A); + __MM512_REDUCE_OP(min_pd); +} + +extern __inline double + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_reduce_max_pd(__mmask8 __U, __m512d __A) { + __A = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __U, __A); + __MM512_REDUCE_OP(max_pd); +} + +#undef __MM512_REDUCE_OP + +#ifdef __DISABLE_AVX512F__ +#undef 
__DISABLE_AVX512F__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512F__ */
+
+#endif /* _AVX512FINTRIN_H_INCLUDED */
diff --git a/third_party/intel/avx512ifmaintrin.internal.h b/third_party/intel/avx512ifmaintrin.internal.h
new file mode 100644
index 000000000..4efd6fe4f
--- /dev/null
+++ b/third_party/intel/avx512ifmaintrin.internal.h
@@ -0,0 +1,65 @@
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512ifmaintrin.internal.h> directly; include <immintrin.internal.h> instead."
+#endif
+
+#ifndef _AVX512IFMAINTRIN_H_INCLUDED
+#define _AVX512IFMAINTRIN_H_INCLUDED
+
+#ifndef __AVX512IFMA__
+#pragma GCC push_options
+#pragma GCC target("avx512ifma")
+#define __DISABLE_AVX512IFMA__
+#endif /* __AVX512IFMA__ */
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
+  return (__m512i)__builtin_ia32_vpmadd52luq512_mask((__v8di)__X, (__v8di)__Y,
+                                                     (__v8di)__Z, (__mmask8)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
+  return (__m512i)__builtin_ia32_vpmadd52huq512_mask((__v8di)__X, (__v8di)__Y,
+                                                     (__v8di)__Z, (__mmask8)-1);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_madd52lo_epu64(__m512i __W, __mmask8 __M, __m512i __X,
+                               __m512i __Y) {
+  return (__m512i)__builtin_ia32_vpmadd52luq512_mask(
+      (__v8di)__W, (__v8di)__X, (__v8di)__Y, (__mmask8)__M);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_madd52hi_epu64(__m512i __W, __mmask8 __M, __m512i __X,
+                               __m512i __Y) {
+  return (__m512i)__builtin_ia32_vpmadd52huq512_mask(
+      (__v8di)__W, (__v8di)__X, (__v8di)__Y, (__mmask8)__M);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_madd52lo_epu64(__mmask8 __M, __m512i __X, __m512i __Y,
+                                __m512i __Z) {
+  return (__m512i)__builtin_ia32_vpmadd52luq512_maskz(
+      (__v8di)__X, (__v8di)__Y, (__v8di)__Z, (__mmask8)__M);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_madd52hi_epu64(__mmask8 __M, __m512i __X, __m512i __Y,
+                                __m512i __Z) {
+  return (__m512i)__builtin_ia32_vpmadd52huq512_maskz(
+      (__v8di)__X, (__v8di)__Y, (__v8di)__Z, (__mmask8)__M);
+}
+
+#ifdef __DISABLE_AVX512IFMA__
+#undef __DISABLE_AVX512IFMA__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512IFMA__ */
+
+#endif /* _AVX512IFMAINTRIN_H_INCLUDED */
diff --git a/third_party/intel/avx512ifmavlintrin.internal.h b/third_party/intel/avx512ifmavlintrin.internal.h
new file mode 100644
index 000000000..2f7abafd3
--- /dev/null
+++ b/third_party/intel/avx512ifmavlintrin.internal.h
@@ -0,0 +1,108 @@
+#ifndef _IMMINTRIN_H_INCLUDED
+#error \
+    "Never use <avx512ifmavlintrin.internal.h> directly; include <immintrin.internal.h> instead."
+#endif + +#ifndef _AVX512IFMAVLINTRIN_H_INCLUDED +#define _AVX512IFMAVLINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512IFMA__) +#pragma GCC push_options +#pragma GCC target("avx512ifma,avx512vl") +#define __DISABLE_AVX512IFMAVL__ +#endif /* __AVX512IFMAVL__ */ + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) { + return (__m128i)__builtin_ia32_vpmadd52luq128_mask((__v2di)__X, (__v2di)__Y, + (__v2di)__Z, (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) { + return (__m128i)__builtin_ia32_vpmadd52huq128_mask((__v2di)__X, (__v2di)__Y, + (__v2di)__Z, (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) { + return (__m256i)__builtin_ia32_vpmadd52luq256_mask((__v4di)__X, (__v4di)__Y, + (__v4di)__Z, (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) { + return (__m256i)__builtin_ia32_vpmadd52huq256_mask((__v4di)__X, (__v4di)__Y, + (__v4di)__Z, (__mmask8)-1); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_madd52lo_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_vpmadd52luq128_mask( + (__v2di)__W, (__v2di)__X, (__v2di)__Y, (__mmask8)__M); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_madd52hi_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_vpmadd52huq128_mask( + (__v2di)__W, (__v2di)__X, (__v2di)__Y, (__mmask8)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_madd52lo_epu64(__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) { + return (__m256i)__builtin_ia32_vpmadd52luq256_mask( + (__v4di)__W, (__v4di)__X, (__v4di)__Y, (__mmask8)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_madd52hi_epu64(__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) { + return (__m256i)__builtin_ia32_vpmadd52huq256_mask( + (__v4di)__W, (__v4di)__X, (__v4di)__Y, (__mmask8)__M); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_madd52lo_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { + return (__m128i)__builtin_ia32_vpmadd52luq128_maskz( + (__v2di)__X, (__v2di)__Y, (__v2di)__Z, (__mmask8)__M); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_madd52hi_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { + return (__m128i)__builtin_ia32_vpmadd52huq128_maskz( + (__v2di)__X, (__v2di)__Y, (__v2di)__Z, (__mmask8)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_madd52lo_epu64(__mmask8 __M, __m256i __X, __m256i __Y, + __m256i __Z) { + return (__m256i)__builtin_ia32_vpmadd52luq256_maskz( + (__v4di)__X, (__v4di)__Y, (__v4di)__Z, (__mmask8)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm256_maskz_madd52hi_epu64(__mmask8 __M, __m256i __X, __m256i __Y,
+                            __m256i __Z) {
+  return (__m256i)__builtin_ia32_vpmadd52huq256_maskz(
+      (__v4di)__X, (__v4di)__Y, (__v4di)__Z, (__mmask8)__M);
+}
+
+#ifdef __DISABLE_AVX512IFMAVL__
+#undef __DISABLE_AVX512IFMAVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512IFMAVL__ */
+
+#endif /* _AVX512IFMAVLINTRIN_H_INCLUDED */
diff --git a/third_party/intel/avx512pfintrin.internal.h b/third_party/intel/avx512pfintrin.internal.h
new file mode 100644
index 000000000..e73bc9081
--- /dev/null
+++ b/third_party/intel/avx512pfintrin.internal.h
@@ -0,0 +1,221 @@
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512PFINTRIN_H_INCLUDED
+#define _AVX512PFINTRIN_H_INCLUDED
+
+#ifndef __AVX512PF__
+#pragma GCC push_options
+#pragma GCC target("avx512pf")
+#define __DISABLE_AVX512PF__
+#endif /* __AVX512PF__ */
+
+typedef long long __v8di __attribute__((__vector_size__(64)));
+typedef int __v16si __attribute__((__vector_size__(64)));
+typedef long long __m512i __attribute__((__vector_size__(64), __may_alias__));
+typedef unsigned char __mmask8;
+typedef unsigned short __mmask16;
+
+#ifdef __OPTIMIZE__
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_prefetch_i32gather_pd(__m256i __index, void const *__addr,
+                                 int __scale, int __hint) {
+  __builtin_ia32_gatherpfdpd((__mmask8)0xFF, (__v8si)__index, __addr, __scale,
+                             __hint);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_prefetch_i32gather_ps(__m512i __index, void const *__addr,
+                                 int __scale, int __hint) {
+  __builtin_ia32_gatherpfdps((__mmask16)0xFFFF, (__v16si)__index, __addr,
+                             __scale, __hint);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_prefetch_i32gather_pd(__m256i __index, __mmask8 __mask,
+                                      void const *__addr, int __scale,
+                                      int __hint) {
+  __builtin_ia32_gatherpfdpd(__mask, (__v8si)__index, __addr, __scale, __hint);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_prefetch_i32gather_ps(__m512i __index, __mmask16 __mask,
+                                      void const *__addr, int __scale,
+                                      int __hint) {
+  __builtin_ia32_gatherpfdps(__mask, (__v16si)__index, __addr, __scale, __hint);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_prefetch_i64gather_pd(__m512i __index, void const *__addr,
+                                 int __scale, int __hint) {
+  __builtin_ia32_gatherpfqpd((__mmask8)0xFF, (__v8di)__index, __addr, __scale,
+                             __hint);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_prefetch_i64gather_ps(__m512i __index, void const *__addr,
+                                 int __scale, int __hint) {
+  __builtin_ia32_gatherpfqps((__mmask8)0xFF, (__v8di)__index, __addr, __scale,
+                             __hint);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_prefetch_i64gather_pd(__m512i __index, __mmask8 __mask,
+                                      void const *__addr, int __scale,
+                                      int __hint) {
+  __builtin_ia32_gatherpfqpd(__mask, (__v8di)__index, __addr, __scale, __hint);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_prefetch_i64gather_ps(__m512i __index, __mmask8 __mask,
+                                      void const *__addr, int __scale,
+                                      int __hint) {
+  __builtin_ia32_gatherpfqps(__mask, (__v8di)__index, __addr, __scale, __hint);
+}
+
+extern __inline void
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_prefetch_i32scatter_pd(void *__addr, __m256i __index, int __scale, + int __hint) { + __builtin_ia32_scatterpfdpd((__mmask8)0xFF, (__v8si)__index, __addr, __scale, + __hint); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_prefetch_i32scatter_ps(void *__addr, __m512i __index, int __scale, + int __hint) { + __builtin_ia32_scatterpfdps((__mmask16)0xFFFF, (__v16si)__index, __addr, + __scale, __hint); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_prefetch_i32scatter_pd(void *__addr, __mmask8 __mask, + __m256i __index, int __scale, + int __hint) { + __builtin_ia32_scatterpfdpd(__mask, (__v8si)__index, __addr, __scale, __hint); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_prefetch_i32scatter_ps(void *__addr, __mmask16 __mask, + __m512i __index, int __scale, + int __hint) { + __builtin_ia32_scatterpfdps(__mask, (__v16si)__index, __addr, __scale, + __hint); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_prefetch_i64scatter_pd(void *__addr, __m512i __index, int __scale, + int __hint) { + __builtin_ia32_scatterpfqpd((__mmask8)0xFF, (__v8di)__index, __addr, __scale, + __hint); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_prefetch_i64scatter_ps(void *__addr, __m512i __index, int __scale, + int __hint) { + __builtin_ia32_scatterpfqps((__mmask8)0xFF, (__v8di)__index, __addr, __scale, + __hint); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_prefetch_i64scatter_pd(void *__addr, __mmask8 __mask, + __m512i __index, int __scale, + int __hint) { + __builtin_ia32_scatterpfqpd(__mask, (__v8di)__index, __addr, __scale, __hint); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_prefetch_i64scatter_ps(void *__addr, __mmask8 __mask, + __m512i __index, int __scale, + int __hint) { + __builtin_ia32_scatterpfqps(__mask, (__v8di)__index, __addr, __scale, __hint); +} + +#else +#define _mm512_prefetch_i32gather_pd(INDEX, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfdpd((__mmask8)0xFF, (__v8si)(__m256i)INDEX, \ + (void const *)ADDR, (int)SCALE, (int)HINT) + +#define _mm512_prefetch_i32gather_ps(INDEX, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfdps((__mmask16)0xFFFF, (__v16si)(__m512i)INDEX, \ + (void const *)ADDR, (int)SCALE, (int)HINT) + +#define _mm512_mask_prefetch_i32gather_pd(INDEX, MASK, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfdpd((__mmask8)MASK, (__v8si)(__m256i)INDEX, \ + (void const *)ADDR, (int)SCALE, (int)HINT) + +#define _mm512_mask_prefetch_i32gather_ps(INDEX, MASK, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfdps((__mmask16)MASK, (__v16si)(__m512i)INDEX, \ + (void const *)ADDR, (int)SCALE, (int)HINT) + +#define _mm512_prefetch_i64gather_pd(INDEX, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfqpd((__mmask8)0xFF, (__v8di)(__m512i)INDEX, \ + (void *)ADDR, (int)SCALE, (int)HINT) + +#define _mm512_prefetch_i64gather_ps(INDEX, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfqps((__mmask8)0xFF, (__v8di)(__m512i)INDEX, \ + (void *)ADDR, (int)SCALE, (int)HINT) + +#define _mm512_mask_prefetch_i64gather_pd(INDEX, MASK, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfqpd((__mmask8)MASK, (__v8di)(__m512i)INDEX, \ + 
(void *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_mask_prefetch_i64gather_ps(INDEX, MASK, ADDR, SCALE, HINT) \
+  __builtin_ia32_gatherpfqps((__mmask8)MASK, (__v8di)(__m512i)INDEX, \
+                             (void *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_prefetch_i32scatter_pd(ADDR, INDEX, SCALE, HINT) \
+  __builtin_ia32_scatterpfdpd((__mmask8)0xFF, (__v8si)(__m256i)INDEX, \
+                              (void *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_prefetch_i32scatter_ps(ADDR, INDEX, SCALE, HINT) \
+  __builtin_ia32_scatterpfdps((__mmask16)0xFFFF, (__v16si)(__m512i)INDEX, \
+                              (void *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_mask_prefetch_i32scatter_pd(ADDR, MASK, INDEX, SCALE, HINT) \
+  __builtin_ia32_scatterpfdpd((__mmask8)MASK, (__v8si)(__m256i)INDEX, \
+                              (void *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_mask_prefetch_i32scatter_ps(ADDR, MASK, INDEX, SCALE, HINT) \
+  __builtin_ia32_scatterpfdps((__mmask16)MASK, (__v16si)(__m512i)INDEX, \
+                              (void *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_prefetch_i64scatter_pd(ADDR, INDEX, SCALE, HINT) \
+  __builtin_ia32_scatterpfqpd((__mmask8)0xFF, (__v8di)(__m512i)INDEX, \
+                              (void *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_prefetch_i64scatter_ps(ADDR, INDEX, SCALE, HINT) \
+  __builtin_ia32_scatterpfqps((__mmask8)0xFF, (__v8di)(__m512i)INDEX, \
+                              (void *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_mask_prefetch_i64scatter_pd(ADDR, MASK, INDEX, SCALE, HINT) \
+  __builtin_ia32_scatterpfqpd((__mmask8)MASK, (__v8di)(__m512i)INDEX, \
+                              (void *)ADDR, (int)SCALE, (int)HINT)
+
+#define _mm512_mask_prefetch_i64scatter_ps(ADDR, MASK, INDEX, SCALE, HINT) \
+  __builtin_ia32_scatterpfqps((__mmask8)MASK, (__v8di)(__m512i)INDEX, \
+                              (void *)ADDR, (int)SCALE, (int)HINT)
+#endif
+
+#ifdef __DISABLE_AVX512PF__
+#undef __DISABLE_AVX512PF__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512PF__ */
+
+#endif /* _AVX512PFINTRIN_H_INCLUDED */
diff --git a/third_party/intel/avx512vbmi2intrin.internal.h b/third_party/intel/avx512vbmi2intrin.internal.h
new file mode 100644
index 000000000..8200b23b9
--- /dev/null
+++ b/third_party/intel/avx512vbmi2intrin.internal.h
@@ -0,0 +1,455 @@
+#ifndef _IMMINTRIN_H_INCLUDED
+#error \
+    "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
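/* Looking back at the avx512pf wrappers just above: the gather/scatter
   prefetches only hint the caches, they load and store nothing, and take
   the same index/scale shape as the real gathers. A usage sketch, ours
   rather than the patch's; the helper name, the scale of 8 (for doubles),
   and the T0 hint are illustrative choices. */
#include <immintrin.h>
__attribute__((target("avx512pf"))) static inline void
prefetch_gather_rows(__m512i idx, const double *base) {
  /* warm the caches for a later _mm512_i64gather_pd(idx, base, 8) */
  _mm512_prefetch_i64gather_pd(idx, base, 8, _MM_HINT_T0);
}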
+#endif + +#ifndef __AVX512VBMI2INTRIN_H_INCLUDED +#define __AVX512VBMI2INTRIN_H_INCLUDED + +#if !defined(__AVX512VBMI2__) +#pragma GCC push_options +#pragma GCC target("avx512vbmi2") +#define __DISABLE_AVX512VBMI2__ +#endif /* __AVX512VBMI2__ */ + +#ifdef __OPTIMIZE__ +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shrdi_epi16(__m512i __A, __m512i __B, int __C) { + return (__m512i)__builtin_ia32_vpshrd_v32hi((__v32hi)__A, (__v32hi)__B, __C); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shrdi_epi32(__m512i __A, __m512i __B, int __C) { + return (__m512i)__builtin_ia32_vpshrd_v16si((__v16si)__A, (__v16si)__B, __C); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_shrdi_epi32(__m512i __A, __mmask16 __B, __m512i __C, + __m512i __D, int __E) { + return (__m512i)__builtin_ia32_vpshrd_v16si_mask( + (__v16si)__C, (__v16si)__D, __E, (__v16si)__A, (__mmask16)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_shrdi_epi32(__mmask16 __A, __m512i __B, __m512i __C, int __D) { + return (__m512i)__builtin_ia32_vpshrd_v16si_mask( + (__v16si)__B, (__v16si)__C, __D, (__v16si)_mm512_setzero_si512(), + (__mmask16)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shrdi_epi64(__m512i __A, __m512i __B, int __C) { + return (__m512i)__builtin_ia32_vpshrd_v8di((__v8di)__A, (__v8di)__B, __C); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_shrdi_epi64(__m512i __A, __mmask8 __B, __m512i __C, __m512i __D, + int __E) { + return (__m512i)__builtin_ia32_vpshrd_v8di_mask((__v8di)__C, (__v8di)__D, __E, + (__v8di)__A, (__mmask8)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_shrdi_epi64(__mmask8 __A, __m512i __B, __m512i __C, int __D) { + return (__m512i)__builtin_ia32_vpshrd_v8di_mask( + (__v8di)__B, (__v8di)__C, __D, (__v8di)_mm512_setzero_si512(), + (__mmask8)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shldi_epi16(__m512i __A, __m512i __B, int __C) { + return (__m512i)__builtin_ia32_vpshld_v32hi((__v32hi)__A, (__v32hi)__B, __C); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shldi_epi32(__m512i __A, __m512i __B, int __C) { + return (__m512i)__builtin_ia32_vpshld_v16si((__v16si)__A, (__v16si)__B, __C); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_shldi_epi32(__m512i __A, __mmask16 __B, __m512i __C, + __m512i __D, int __E) { + return (__m512i)__builtin_ia32_vpshld_v16si_mask( + (__v16si)__C, (__v16si)__D, __E, (__v16si)__A, (__mmask16)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_shldi_epi32(__mmask16 __A, __m512i __B, __m512i __C, int __D) { + return (__m512i)__builtin_ia32_vpshld_v16si_mask( + (__v16si)__B, (__v16si)__C, __D, (__v16si)_mm512_setzero_si512(), + (__mmask16)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shldi_epi64(__m512i __A, __m512i __B, int __C) { + return (__m512i)__builtin_ia32_vpshld_v8di((__v8di)__A, (__v8di)__B, __C); +} + 
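/* Orientation for the shldi/shrdi block above: each one is a funnel
   shift with an immediate count. A scalar reference for the epi64 case,
   our own sketch rather than anything from the patch. */
#include <stdint.h>
/* _mm512_shldi_epi64(a, b, c), per 64-bit lane: take the 128-bit value
   (a:b), shift it left by c mod 64, and keep the high 64 bits. */
static inline uint64_t shld64_ref(uint64_t a, uint64_t b, unsigned c) {
  c &= 63; /* the instruction reduces the count modulo the lane width */
  return c ? (a << c) | (b >> (64 - c)) : a;
}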
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_shldi_epi64(__m512i __A, __mmask8 __B, __m512i __C, __m512i __D,
+                            int __E) {
+  return (__m512i)__builtin_ia32_vpshld_v8di_mask((__v8di)__C, (__v8di)__D, __E,
+                                                  (__v8di)__A, (__mmask8)__B);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_shldi_epi64(__mmask8 __A, __m512i __B, __m512i __C, int __D) {
+  return (__m512i)__builtin_ia32_vpshld_v8di_mask(
+      (__v8di)__B, (__v8di)__C, __D, (__v8di)_mm512_setzero_si512(),
+      (__mmask8)__A);
+}
+#else
+#define _mm512_shrdi_epi16(A, B, C) \
+  ((__m512i) __builtin_ia32_vpshrd_v32hi ((__v32hi)(__m512i)(A), \
+                                          (__v32hi)(__m512i)(B),(int)(C)))
+#define _mm512_shrdi_epi32(A, B, C) \
+  ((__m512i) __builtin_ia32_vpshrd_v16si ((__v16si)(__m512i)(A), \
+                                          (__v16si)(__m512i)(B),(int)(C)))
+#define _mm512_mask_shrdi_epi32(A, B, C, D, E) \
+  ((__m512i) __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(C), \
+    (__v16si)(__m512i)(D), (int)(E), (__v16si)(__m512i)(A),(__mmask16)(B)))
+#define _mm512_maskz_shrdi_epi32(A, B, C, D) \
+  ((__m512i) __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(B), \
+    (__v16si)(__m512i)(C),(int)(D), \
+    (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(A)))
+#define _mm512_shrdi_epi64(A, B, C) \
+  ((__m512i) __builtin_ia32_vpshrd_v8di ((__v8di)(__m512i)(A), \
+                                         (__v8di)(__m512i)(B),(int)(C)))
+#define _mm512_mask_shrdi_epi64(A, B, C, D, E) \
+  ((__m512i) __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(C), \
+    (__v8di)(__m512i)(D), (int)(E), (__v8di)(__m512i)(A),(__mmask8)(B)))
+#define _mm512_maskz_shrdi_epi64(A, B, C, D) \
+  ((__m512i) __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(B), \
+    (__v8di)(__m512i)(C),(int)(D), \
+    (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(A)))
+#define _mm512_shldi_epi16(A, B, C) \
+  ((__m512i) __builtin_ia32_vpshld_v32hi ((__v32hi)(__m512i)(A), \
+                                          (__v32hi)(__m512i)(B),(int)(C)))
+#define _mm512_shldi_epi32(A, B, C) \
+  ((__m512i) __builtin_ia32_vpshld_v16si ((__v16si)(__m512i)(A), \
+                                          (__v16si)(__m512i)(B),(int)(C)))
+#define _mm512_mask_shldi_epi32(A, B, C, D, E) \
+  ((__m512i) __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(C), \
+    (__v16si)(__m512i)(D), (int)(E), (__v16si)(__m512i)(A),(__mmask16)(B)))
+#define _mm512_maskz_shldi_epi32(A, B, C, D) \
+  ((__m512i) __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(B), \
+    (__v16si)(__m512i)(C),(int)(D), \
+    (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(A)))
+#define _mm512_shldi_epi64(A, B, C) \
+  ((__m512i) __builtin_ia32_vpshld_v8di ((__v8di)(__m512i)(A), \
+                                         (__v8di)(__m512i)(B),(int)(C)))
+#define _mm512_mask_shldi_epi64(A, B, C, D, E) \
+  ((__m512i) __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(C), \
+    (__v8di)(__m512i)(D), (int)(E), (__v8di)(__m512i)(A),(__mmask8)(B)))
+#define _mm512_maskz_shldi_epi64(A, B, C, D) \
+  ((__m512i) __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(B), \
+    (__v8di)(__m512i)(C),(int)(D), \
+    (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(A)))
+#endif
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C) {
+  return (__m512i)__builtin_ia32_vpshrdv_v32hi((__v32hi)__A, (__v32hi)__B,
+                                               (__v32hi)__C);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C) {
+  return
(__m512i)__builtin_ia32_vpshrdv_v16si((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { + return (__m512i)__builtin_ia32_vpshrdv_v16si_mask( + (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_maskz_shrdv_epi32(__mmask16 __A, __m512i __B, __m512i __C, __m512i __D) { + return (__m512i)__builtin_ia32_vpshrdv_v16si_maskz( + (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_vpshrdv_v8di((__v8di)__A, (__v8di)__B, + (__v8di)__C); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __B, __m512i __C, __m512i __D) { + return (__m512i)__builtin_ia32_vpshrdv_v8di_mask((__v8di)__A, (__v8di)__C, + (__v8di)__D, (__mmask8)__B); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_maskz_shrdv_epi64(__mmask8 __A, __m512i __B, __m512i __C, __m512i __D) { + return (__m512i)__builtin_ia32_vpshrdv_v8di_maskz((__v8di)__B, (__v8di)__C, + (__v8di)__D, (__mmask8)__A); +} +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_vpshldv_v32hi((__v32hi)__A, (__v32hi)__B, + (__v32hi)__C); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_vpshldv_v16si((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_shldv_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { + return (__m512i)__builtin_ia32_vpshldv_v16si_mask( + (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_maskz_shldv_epi32(__mmask16 __A, __m512i __B, __m512i __C, __m512i __D) { + return (__m512i)__builtin_ia32_vpshldv_v16si_maskz( + (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_vpshldv_v8di((__v8di)__A, (__v8di)__B, + (__v8di)__C); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_shldv_epi64(__m512i __A, __mmask8 __B, __m512i __C, __m512i __D) { + return (__m512i)__builtin_ia32_vpshldv_v8di_mask((__v8di)__A, (__v8di)__C, + (__v8di)__D, (__mmask8)__B); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_maskz_shldv_epi64(__mmask8 __A, __m512i __B, __m512i __C, __m512i __D) { + return (__m512i)__builtin_ia32_vpshldv_v8di_maskz((__v8di)__B, (__v8di)__C, + (__v8di)__D, (__mmask8)__A); +} + +#ifdef __DISABLE_AVX512VBMI2__ +#undef __DISABLE_AVX512VBMI2__ + +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VBMI2__ */ + 
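/* The section that follows guards the byte/word compress and expand
   forms behind AVX512BW as well, since they need 64- and 32-bit masks.
   As a usage sketch (ours, hypothetical helper name): mask-driven
   left-packing, the core primitive of branch-free SIMD filtering. */
#include <immintrin.h>
__attribute__((target("avx512vbmi2,avx512bw"))) static inline __m512i
filter_bytes(__m512i v, __mmask64 keep) {
  /* bytes whose mask bit is set are packed to the low end, rest zeroed */
  return _mm512_maskz_compress_epi8(keep, v);
}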
+#if !defined(__AVX512VBMI2__) || !defined(__AVX512BW__) +#pragma GCC push_options +#pragma GCC target("avx512vbmi2,avx512bw") +#define __DISABLE_AVX512VBMI2BW__ +#endif /* __AVX512VBMI2BW__ */ + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_compress_epi8(__m512i __A, __mmask64 __B, __m512i __C) { + return (__m512i)__builtin_ia32_compressqi512_mask((__v64qi)__C, (__v64qi)__A, + (__mmask64)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_compress_epi8(__mmask64 __A, __m512i __B) { + return (__m512i)__builtin_ia32_compressqi512_mask( + (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__A); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_compressstoreu_epi8(void *__A, __mmask64 __B, __m512i __C) { + __builtin_ia32_compressstoreuqi512_mask((__v64qi *)__A, (__v64qi)__C, + (__mmask64)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_compress_epi16(__m512i __A, __mmask32 __B, __m512i __C) { + return (__m512i)__builtin_ia32_compresshi512_mask((__v32hi)__C, (__v32hi)__A, + (__mmask32)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_compress_epi16(__mmask32 __A, __m512i __B) { + return (__m512i)__builtin_ia32_compresshi512_mask( + (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__A); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_compressstoreu_epi16(void *__A, __mmask32 __B, __m512i __C) { + __builtin_ia32_compressstoreuhi512_mask((__v32hi *)__A, (__v32hi)__C, + (__mmask32)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_expand_epi8(__m512i __A, __mmask64 __B, __m512i __C) { + return (__m512i)__builtin_ia32_expandqi512_mask((__v64qi)__C, (__v64qi)__A, + (__mmask64)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_expand_epi8(__mmask64 __A, __m512i __B) { + return (__m512i)__builtin_ia32_expandqi512_maskz( + (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_expandloadu_epi8(__m512i __A, __mmask64 __B, const void *__C) { + return (__m512i)__builtin_ia32_expandloadqi512_mask( + (const __v64qi *)__C, (__v64qi)__A, (__mmask64)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_expandloadu_epi8(__mmask64 __A, const void *__B) { + return (__m512i)__builtin_ia32_expandloadqi512_maskz( + (const __v64qi *)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_expand_epi16(__m512i __A, __mmask32 __B, __m512i __C) { + return (__m512i)__builtin_ia32_expandhi512_mask((__v32hi)__C, (__v32hi)__A, + (__mmask32)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_expand_epi16(__mmask32 __A, __m512i __B) { + return (__m512i)__builtin_ia32_expandhi512_maskz( + (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+    _mm512_mask_expandloadu_epi16(__m512i __A, __mmask32 __B, const void *__C) {
+  return (__m512i)__builtin_ia32_expandloadhi512_mask(
+      (const __v32hi *)__C, (__v32hi)__A, (__mmask32)__B);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_expandloadu_epi16(__mmask32 __A, const void *__B) {
+  return (__m512i)__builtin_ia32_expandloadhi512_maskz(
+      (const __v32hi *)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_shrdi_epi16(__m512i __A, __mmask32 __B, __m512i __C,
+                            __m512i __D, int __E) {
+  return (__m512i)__builtin_ia32_vpshrd_v32hi_mask(
+      (__v32hi)__C, (__v32hi)__D, __E, (__v32hi)__A, (__mmask32)__B);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_shrdi_epi16(__mmask32 __A, __m512i __B, __m512i __C, int __D) {
+  return (__m512i)__builtin_ia32_vpshrd_v32hi_mask(
+      (__v32hi)__B, (__v32hi)__C, __D, (__v32hi)_mm512_setzero_si512(),
+      (__mmask32)__A);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_mask_shldi_epi16(__m512i __A, __mmask32 __B, __m512i __C,
+                            __m512i __D, int __E) {
+  return (__m512i)__builtin_ia32_vpshld_v32hi_mask(
+      (__v32hi)__C, (__v32hi)__D, __E, (__v32hi)__A, (__mmask32)__B);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_maskz_shldi_epi16(__mmask32 __A, __m512i __B, __m512i __C, int __D) {
+  return (__m512i)__builtin_ia32_vpshld_v32hi_mask(
+      (__v32hi)__B, (__v32hi)__C, __D, (__v32hi)_mm512_setzero_si512(),
+      (__mmask32)__A);
+}
+
+#else
+#define _mm512_mask_shrdi_epi16(A, B, C, D, E) \
+  ((__m512i) __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(C), \
+    (__v32hi)(__m512i)(D), (int)(E), (__v32hi)(__m512i)(A),(__mmask32)(B)))
+#define _mm512_maskz_shrdi_epi16(A, B, C, D) \
+  ((__m512i) __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(B), \
+    (__v32hi)(__m512i)(C),(int)(D), \
+    (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(A)))
+#define _mm512_mask_shldi_epi16(A, B, C, D, E) \
+  ((__m512i) __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(C), \
+    (__v32hi)(__m512i)(D), (int)(E), (__v32hi)(__m512i)(A),(__mmask32)(B)))
+#define _mm512_maskz_shldi_epi16(A, B, C, D) \
+  ((__m512i) __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(B), \
+    (__v32hi)(__m512i)(C),(int)(D), \
+    (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(A)))
+#endif
+
+extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __B, __m512i __C, __m512i __D) {
+  return (__m512i)__builtin_ia32_vpshrdv_v32hi_mask(
+      (__v32hi)__A, (__v32hi)__C, (__v32hi)__D, (__mmask32)__B);
+}
+
+extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm512_maskz_shrdv_epi16(__mmask32 __A, __m512i __B, __m512i __C, __m512i __D) {
+  return (__m512i)__builtin_ia32_vpshrdv_v32hi_maskz(
+      (__v32hi)__B, (__v32hi)__C, (__v32hi)__D, (__mmask32)__A);
+}
+
+extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm512_mask_shldv_epi16(__m512i __A, __mmask32 __B, __m512i __C, __m512i __D) {
+  return (__m512i)__builtin_ia32_vpshldv_v32hi_mask(
+      (__v32hi)__A, (__v32hi)__C, (__v32hi)__D, (__mmask32)__B);
+}
+
+extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm512_maskz_shldv_epi16(__mmask32 __A, __m512i __B, __m512i __C, __m512i __D) {
+  return (__m512i)__builtin_ia32_vpshldv_v32hi_maskz(
+      (__v32hi)__B, (__v32hi)__C, (__v32hi)__D, (__mmask32)__A);
+}
+
+#ifdef __DISABLE_AVX512VBMI2BW__
+#undef __DISABLE_AVX512VBMI2BW__
+
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMI2BW__ */
+
+#endif /* __AVX512VBMI2INTRIN_H_INCLUDED */
diff --git a/third_party/intel/avx512vbmi2vlintrin.internal.h b/third_party/intel/avx512vbmi2vlintrin.internal.h
new file mode 100644
index 000000000..63083a57f
--- /dev/null
+++ b/third_party/intel/avx512vbmi2vlintrin.internal.h
@@ -0,0 +1,866 @@
+#ifndef _IMMINTRIN_H_INCLUDED
+#error \
+    "Never use <avx512vbmi2vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VBMI2VLINTRIN_H_INCLUDED
+#define _AVX512VBMI2VLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__)
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi2,avx512vl")
+#define __DISABLE_AVX512VBMI2VL__
+#endif /* __AVX512VBMIVL__ */
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_compress_epi8(__m128i __A, __mmask16 __B, __m128i __C) {
+  return (__m128i)__builtin_ia32_compressqi128_mask((__v16qi)__C, (__v16qi)__A,
+                                                    (__mmask16)__B);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_compress_epi8(__mmask16 __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_compressqi128_mask(
+      (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__A);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_compressstoreu_epi16(void *__A, __mmask16 __B, __m256i __C) {
+  __builtin_ia32_compressstoreuhi256_mask((__v16hi *)__A, (__v16hi)__C,
+                                          (__mmask16)__B);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_compress_epi16(__m128i __A, __mmask8 __B, __m128i __C) {
+  return (__m128i)__builtin_ia32_compresshi128_mask((__v8hi)__C, (__v8hi)__A,
+                                                    (__mmask8)__B);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_compress_epi16(__mmask8 __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_compresshi128_mask(
+      (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__A);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_compress_epi16(__m256i __A, __mmask16 __B, __m256i __C) {
+  return (__m256i)__builtin_ia32_compresshi256_mask((__v16hi)__C, (__v16hi)__A,
+                                                    (__mmask16)__B);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_compress_epi16(__mmask16 __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_compresshi256_mask(
+      (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__A);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_compressstoreu_epi8(void *__A, __mmask16 __B, __m128i __C) {
+  __builtin_ia32_compressstoreuqi128_mask((__v16qi *)__A, (__v16qi)__C,
+                                          (__mmask16)__B);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_compressstoreu_epi16(void *__A, __mmask8 __B, __m128i __C) {
+  __builtin_ia32_compressstoreuhi128_mask((__v8hi *)__A, (__v8hi)__C,
+                                          (__mmask8)__B);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__,
__artificial__)) + _mm_mask_expand_epi8(__m128i __A, __mmask16 __B, __m128i __C) { + return (__m128i)__builtin_ia32_expandqi128_mask((__v16qi)__C, (__v16qi)__A, + (__mmask16)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_expand_epi8(__mmask16 __A, __m128i __B) { + return (__m128i)__builtin_ia32_expandqi128_maskz( + (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_expandloadu_epi8(__m128i __A, __mmask16 __B, const void *__C) { + return (__m128i)__builtin_ia32_expandloadqi128_mask( + (const __v16qi *)__C, (__v16qi)__A, (__mmask16)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_expandloadu_epi8(__mmask16 __A, const void *__B) { + return (__m128i)__builtin_ia32_expandloadqi128_maskz( + (const __v16qi *)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_expand_epi16(__m128i __A, __mmask8 __B, __m128i __C) { + return (__m128i)__builtin_ia32_expandhi128_mask((__v8hi)__C, (__v8hi)__A, + (__mmask8)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_expand_epi16(__mmask8 __A, __m128i __B) { + return (__m128i)__builtin_ia32_expandhi128_maskz( + (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_expandloadu_epi16(__m128i __A, __mmask8 __B, const void *__C) { + return (__m128i)__builtin_ia32_expandloadhi128_mask( + (const __v8hi *)__C, (__v8hi)__A, (__mmask8)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_expandloadu_epi16(__mmask8 __A, const void *__B) { + return (__m128i)__builtin_ia32_expandloadhi128_maskz( + (const __v8hi *)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__A); +} +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_expand_epi16(__m256i __A, __mmask16 __B, __m256i __C) { + return (__m256i)__builtin_ia32_expandhi256_mask((__v16hi)__C, (__v16hi)__A, + (__mmask16)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_expand_epi16(__mmask16 __A, __m256i __B) { + return (__m256i)__builtin_ia32_expandhi256_maskz( + (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_expandloadu_epi16(__m256i __A, __mmask16 __B, const void *__C) { + return (__m256i)__builtin_ia32_expandloadhi256_mask( + (const __v16hi *)__C, (__v16hi)__A, (__mmask16)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_expandloadu_epi16(__mmask16 __A, const void *__B) { + return (__m256i)__builtin_ia32_expandloadhi256_maskz( + (const __v16hi *)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shrdi_epi16(__m256i __A, __m256i __B, int __C) { + return (__m256i)__builtin_ia32_vpshrd_v16hi((__v16hi)__A, (__v16hi)__B, __C); +} + +extern __inline __m256i + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_shrdi_epi16(__m256i __A, __mmask16 __B, __m256i __C, + __m256i __D, int __E) { + return (__m256i)__builtin_ia32_vpshrd_v16hi_mask( + (__v16hi)__C, (__v16hi)__D, __E, (__v16hi)__A, (__mmask16)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_shrdi_epi16(__mmask16 __A, __m256i __B, __m256i __C, int __D) { + return (__m256i)__builtin_ia32_vpshrd_v16hi_mask( + (__v16hi)__B, (__v16hi)__C, __D, (__v16hi)_mm256_setzero_si256(), + (__mmask16)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_shrdi_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) { + return (__m256i)__builtin_ia32_vpshrd_v8si_mask((__v8si)__C, (__v8si)__D, __E, + (__v8si)__A, (__mmask8)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_shrdi_epi32(__mmask8 __A, __m256i __B, __m256i __C, int __D) { + return (__m256i)__builtin_ia32_vpshrd_v8si_mask( + (__v8si)__B, (__v8si)__C, __D, (__v8si)_mm256_setzero_si256(), + (__mmask8)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shrdi_epi32(__m256i __A, __m256i __B, int __C) { + return (__m256i)__builtin_ia32_vpshrd_v8si((__v8si)__A, (__v8si)__B, __C); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_shrdi_epi64(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) { + return (__m256i)__builtin_ia32_vpshrd_v4di_mask((__v4di)__C, (__v4di)__D, __E, + (__v4di)__A, (__mmask8)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_shrdi_epi64(__mmask8 __A, __m256i __B, __m256i __C, int __D) { + return (__m256i)__builtin_ia32_vpshrd_v4di_mask( + (__v4di)__B, (__v4di)__C, __D, (__v4di)_mm256_setzero_si256(), + (__mmask8)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shrdi_epi64(__m256i __A, __m256i __B, int __C) { + return (__m256i)__builtin_ia32_vpshrd_v4di((__v4di)__A, (__v4di)__B, __C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_shrdi_epi16(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) { + return (__m128i)__builtin_ia32_vpshrd_v8hi_mask((__v8hi)__C, (__v8hi)__D, __E, + (__v8hi)__A, (__mmask8)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_shrdi_epi16(__mmask8 __A, __m128i __B, __m128i __C, int __D) { + return (__m128i)__builtin_ia32_vpshrd_v8hi_mask((__v8hi)__B, (__v8hi)__C, __D, + (__v8hi)_mm_setzero_si128(), + (__mmask8)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shrdi_epi16(__m128i __A, __m128i __B, int __C) { + return (__m128i)__builtin_ia32_vpshrd_v8hi((__v8hi)__A, (__v8hi)__B, __C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_shrdi_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) { + return (__m128i)__builtin_ia32_vpshrd_v4si_mask((__v4si)__C, (__v4si)__D, __E, + (__v4si)__A, (__mmask8)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm_maskz_shrdi_epi32(__mmask8 __A, __m128i __B, __m128i __C, int __D) { + return (__m128i)__builtin_ia32_vpshrd_v4si_mask((__v4si)__B, (__v4si)__C, __D, + (__v4si)_mm_setzero_si128(), + (__mmask8)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shrdi_epi32(__m128i __A, __m128i __B, int __C) { + return (__m128i)__builtin_ia32_vpshrd_v4si((__v4si)__A, (__v4si)__B, __C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_shrdi_epi64(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) { + return (__m128i)__builtin_ia32_vpshrd_v2di_mask((__v2di)__C, (__v2di)__D, __E, + (__v2di)__A, (__mmask8)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_shrdi_epi64(__mmask8 __A, __m128i __B, __m128i __C, int __D) { + return (__m128i)__builtin_ia32_vpshrd_v2di_mask((__v2di)__B, (__v2di)__C, __D, + (__v2di)_mm_setzero_si128(), + (__mmask8)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shrdi_epi64(__m128i __A, __m128i __B, int __C) { + return (__m128i)__builtin_ia32_vpshrd_v2di((__v2di)__A, (__v2di)__B, __C); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shldi_epi16(__m256i __A, __m256i __B, int __C) { + return (__m256i)__builtin_ia32_vpshld_v16hi((__v16hi)__A, (__v16hi)__B, __C); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_shldi_epi16(__m256i __A, __mmask16 __B, __m256i __C, + __m256i __D, int __E) { + return (__m256i)__builtin_ia32_vpshld_v16hi_mask( + (__v16hi)__C, (__v16hi)__D, __E, (__v16hi)__A, (__mmask16)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_shldi_epi16(__mmask16 __A, __m256i __B, __m256i __C, int __D) { + return (__m256i)__builtin_ia32_vpshld_v16hi_mask( + (__v16hi)__B, (__v16hi)__C, __D, (__v16hi)_mm256_setzero_si256(), + (__mmask16)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_shldi_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) { + return (__m256i)__builtin_ia32_vpshld_v8si_mask((__v8si)__C, (__v8si)__D, __E, + (__v8si)__A, (__mmask8)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_shldi_epi32(__mmask8 __A, __m256i __B, __m256i __C, int __D) { + return (__m256i)__builtin_ia32_vpshld_v8si_mask( + (__v8si)__B, (__v8si)__C, __D, (__v8si)_mm256_setzero_si256(), + (__mmask8)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shldi_epi32(__m256i __A, __m256i __B, int __C) { + return (__m256i)__builtin_ia32_vpshld_v8si((__v8si)__A, (__v8si)__B, __C); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_shldi_epi64(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) { + return (__m256i)__builtin_ia32_vpshld_v4di_mask((__v4di)__C, (__v4di)__D, __E, + (__v4di)__A, (__mmask8)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_shldi_epi64(__mmask8 __A, __m256i __B, __m256i __C, int __D) { + return (__m256i)__builtin_ia32_vpshld_v4di_mask( + (__v4di)__B, (__v4di)__C, __D, 
(__v4di)_mm256_setzero_si256(),
+      (__mmask8)__A);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_shldi_epi64(__m256i __A, __m256i __B, int __C) {
+  return (__m256i)__builtin_ia32_vpshld_v4di((__v4di)__A, (__v4di)__B, __C);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_shldi_epi16(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
+                         int __E) {
+  return (__m128i)__builtin_ia32_vpshld_v8hi_mask((__v8hi)__C, (__v8hi)__D, __E,
+                                                  (__v8hi)__A, (__mmask8)__B);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_shldi_epi16(__mmask8 __A, __m128i __B, __m128i __C, int __D) {
+  return (__m128i)__builtin_ia32_vpshld_v8hi_mask((__v8hi)__B, (__v8hi)__C, __D,
+                                                  (__v8hi)_mm_setzero_si128(),
+                                                  (__mmask8)__A);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_shldi_epi16(__m128i __A, __m128i __B, int __C) {
+  return (__m128i)__builtin_ia32_vpshld_v8hi((__v8hi)__A, (__v8hi)__B, __C);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_shldi_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
+                         int __E) {
+  return (__m128i)__builtin_ia32_vpshld_v4si_mask((__v4si)__C, (__v4si)__D, __E,
+                                                  (__v4si)__A, (__mmask8)__B);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_shldi_epi32(__mmask8 __A, __m128i __B, __m128i __C, int __D) {
+  return (__m128i)__builtin_ia32_vpshld_v4si_mask((__v4si)__B, (__v4si)__C, __D,
+                                                  (__v4si)_mm_setzero_si128(),
+                                                  (__mmask8)__A);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_shldi_epi32(__m128i __A, __m128i __B, int __C) {
+  return (__m128i)__builtin_ia32_vpshld_v4si((__v4si)__A, (__v4si)__B, __C);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_shldi_epi64(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
+                         int __E) {
+  return (__m128i)__builtin_ia32_vpshld_v2di_mask((__v2di)__C, (__v2di)__D, __E,
+                                                  (__v2di)__A, (__mmask8)__B);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_shldi_epi64(__mmask8 __A, __m128i __B, __m128i __C, int __D) {
+  return (__m128i)__builtin_ia32_vpshld_v2di_mask((__v2di)__B, (__v2di)__C, __D,
+                                                  (__v2di)_mm_setzero_si128(),
+                                                  (__mmask8)__A);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_shldi_epi64(__m128i __A, __m128i __B, int __C) {
+  return (__m128i)__builtin_ia32_vpshld_v2di((__v2di)__A, (__v2di)__B, __C);
+}
+#else
+#define _mm256_shrdi_epi16(A, B, C) \
+  ((__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)(__m256i)(A), \
+                                          (__v16hi)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shrdi_epi16(A, B, C, D, E) \
+  ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(C), \
+    (__v16hi)(__m256i)(D), (int)(E), (__v16hi)(__m256i)(A),(__mmask16)(B)))
+#define _mm256_maskz_shrdi_epi16(A, B, C, D) \
+  ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(B), \
+    (__v16hi)(__m256i)(C),(int)(D), \
+    (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(A)))
+#define _mm256_shrdi_epi32(A, B, C) \
+  ((__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)(__m256i)(A), \
+                                         (__v8si)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shrdi_epi32(A, B, C, D, E) \
+  ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(C), \
    (__v8si)(__m256i)(D), (int)(E), (__v8si)(__m256i)(A),(__mmask8)(B)))
+#define _mm256_maskz_shrdi_epi32(A, B, C, D) \
+  ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(B), \
+    (__v8si)(__m256i)(C),(int)(D), \
+    (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)))
+#define _mm256_shrdi_epi64(A, B, C) \
+  ((__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)(__m256i)(A), \
+                                         (__v4di)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shrdi_epi64(A, B, C, D, E) \
+  ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(C), \
+    (__v4di)(__m256i)(D), (int)(E), (__v4di)(__m256i)(A),(__mmask8)(B)))
+#define _mm256_maskz_shrdi_epi64(A, B, C, D) \
+  ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(B), \
+    (__v4di)(__m256i)(C),(int)(D), \
+    (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)))
+#define _mm_shrdi_epi16(A, B, C) \
+  ((__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)(__m128i)(A), \
+                                         (__v8hi)(__m128i)(B),(int)(C)))
+#define _mm_mask_shrdi_epi16(A, B, C, D, E) \
+  ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(C), \
+    (__v8hi)(__m128i)(D), (int)(E), (__v8hi)(__m128i)(A),(__mmask8)(B)))
+#define _mm_maskz_shrdi_epi16(A, B, C, D) \
+  ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(B), \
+    (__v8hi)(__m128i)(C),(int)(D), \
+    (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
+#define _mm_shrdi_epi32(A, B, C) \
+  ((__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)(__m128i)(A), \
+                                         (__v4si)(__m128i)(B),(int)(C)))
+#define _mm_mask_shrdi_epi32(A, B, C, D, E) \
+  ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(C), \
+    (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A),(__mmask8)(B)))
+#define _mm_maskz_shrdi_epi32(A, B, C, D) \
+  ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(B), \
+    (__v4si)(__m128i)(C),(int)(D), \
+    (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
+#define _mm_shrdi_epi64(A, B, C) \
+  ((__m128i) __builtin_ia32_vpshrd_v2di ((__v2di)(__m128i)(A), \
+                                         (__v2di)(__m128i)(B),(int)(C)))
+#define _mm_mask_shrdi_epi64(A, B, C, D, E) \
+  ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(C), \
+    (__v2di)(__m128i)(D), (int)(E), (__v2di)(__m128i)(A),(__mmask8)(B)))
+#define _mm_maskz_shrdi_epi64(A, B, C, D) \
+  ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(B), \
+    (__v2di)(__m128i)(C),(int)(D), \
+    (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
+#define _mm256_shldi_epi16(A, B, C) \
+  ((__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)(__m256i)(A), \
+                                          (__v16hi)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shldi_epi16(A, B, C, D, E) \
+  ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(C), \
+    (__v16hi)(__m256i)(D), (int)(E), (__v16hi)(__m256i)(A),(__mmask16)(B)))
+#define _mm256_maskz_shldi_epi16(A, B, C, D) \
+  ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(B), \
+    (__v16hi)(__m256i)(C),(int)(D), \
+    (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(A)))
+#define _mm256_shldi_epi32(A, B, C) \
+  ((__m256i) __builtin_ia32_vpshld_v8si ((__v8si)(__m256i)(A), \
+                                         (__v8si)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shldi_epi32(A, B, C, D, E) \
+  ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(C), \
+    (__v8si)(__m256i)(D), (int)(E), (__v8si)(__m256i)(A),(__mmask8)(B)))
+#define _mm256_maskz_shldi_epi32(A, B, C, D) \
+  ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(B), \
+    (__v8si)(__m256i)(C),(int)(D), \
+    (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)))
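/* Why these #else macro twins exist (explanatory note, not in the
   original patch): the vpshld/vpshrd shift count is encoded as an
   instruction immediate, so the builtins require an integer constant
   expression. Only under __OPTIMIZE__ can the inline wrappers rely on
   their int argument folding to a constant; without optimization the
   macros paste the literal count straight into the builtin call. */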
+#define _mm256_shldi_epi64(A, B, C) \
+  ((__m256i) __builtin_ia32_vpshld_v4di ((__v4di)(__m256i)(A), \
+                                         (__v4di)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shldi_epi64(A, B, C, D, E) \
+  ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(C), \
+    (__v4di)(__m256i)(D), (int)(E), (__v4di)(__m256i)(A),(__mmask8)(B)))
+#define _mm256_maskz_shldi_epi64(A, B, C, D) \
+  ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(B), \
+    (__v4di)(__m256i)(C),(int)(D), \
+    (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)))
+#define _mm_shldi_epi16(A, B, C) \
+  ((__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)(__m128i)(A), \
+                                         (__v8hi)(__m128i)(B),(int)(C)))
+#define _mm_mask_shldi_epi16(A, B, C, D, E) \
+  ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(C), \
+    (__v8hi)(__m128i)(D), (int)(E), (__v8hi)(__m128i)(A),(__mmask8)(B)))
+#define _mm_maskz_shldi_epi16(A, B, C, D) \
+  ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(B), \
+    (__v8hi)(__m128i)(C),(int)(D), \
+    (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
+#define _mm_shldi_epi32(A, B, C) \
+  ((__m128i) __builtin_ia32_vpshld_v4si ((__v4si)(__m128i)(A), \
+                                         (__v4si)(__m128i)(B),(int)(C)))
+#define _mm_mask_shldi_epi32(A, B, C, D, E) \
+  ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(C), \
+    (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A),(__mmask8)(B)))
+#define _mm_maskz_shldi_epi32(A, B, C, D) \
+  ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(B), \
+    (__v4si)(__m128i)(C),(int)(D), \
+    (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
+#define _mm_shldi_epi64(A, B, C) \
+  ((__m128i) __builtin_ia32_vpshld_v2di ((__v2di)(__m128i)(A), \
+                                         (__v2di)(__m128i)(B),(int)(C)))
+#define _mm_mask_shldi_epi64(A, B, C, D, E) \
+  ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(C), \
+    (__v2di)(__m128i)(D), (int)(E), (__v2di)(__m128i)(A),(__mmask8)(B)))
+#define _mm_maskz_shldi_epi64(A, B, C, D) \
+  ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(B), \
+    (__v2di)(__m128i)(C),(int)(D), \
+    (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
+#endif
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C) {
+  return (__m256i)__builtin_ia32_vpshrdv_v16hi((__v16hi)__A, (__v16hi)__B,
+                                               (__v16hi)__C);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __B, __m256i __C, __m256i __D) {
+  return (__m256i)__builtin_ia32_vpshrdv_v16hi_mask(
+      (__v16hi)__A, (__v16hi)__C, (__v16hi)__D, (__mmask16)__B);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm256_maskz_shrdv_epi16(__mmask16 __A, __m256i __B, __m256i __C, __m256i __D) {
+  return (__m256i)__builtin_ia32_vpshrdv_v16hi_maskz(
+      (__v16hi)__B, (__v16hi)__C, (__v16hi)__D, (__mmask16)__A);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C) {
+  return (__m256i)__builtin_ia32_vpshrdv_v8si((__v8si)__A, (__v8si)__B,
+                                              (__v8si)__C);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+  return (__m256i)__builtin_ia32_vpshrdv_v8si_mask((__v8si)__A, (__v8si)__C,
+                                                   (__v8si)__D, (__mmask8)__B);
+}
+
+extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_maskz_shrdv_epi32(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) { + return (__m256i)__builtin_ia32_vpshrdv_v8si_maskz((__v8si)__B, (__v8si)__C, + (__v8si)__D, (__mmask8)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_vpshrdv_v4di((__v4di)__A, (__v4di)__B, + (__v4di)__C); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { + return (__m256i)__builtin_ia32_vpshrdv_v4di_mask((__v4di)__A, (__v4di)__C, + (__v4di)__D, (__mmask8)__B); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_maskz_shrdv_epi64(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) { + return (__m256i)__builtin_ia32_vpshrdv_v4di_maskz((__v4di)__B, (__v4di)__C, + (__v4di)__D, (__mmask8)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpshrdv_v8hi((__v8hi)__A, (__v8hi)__B, + (__v8hi)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_shrdv_epi16(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpshrdv_v8hi_mask((__v8hi)__A, (__v8hi)__C, + (__v8hi)__D, (__mmask8)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_shrdv_epi16(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpshrdv_v8hi_maskz((__v8hi)__B, (__v8hi)__C, + (__v8hi)__D, (__mmask8)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpshrdv_v4si((__v4si)__A, (__v4si)__B, + (__v4si)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_shrdv_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpshrdv_v4si_mask((__v4si)__A, (__v4si)__C, + (__v4si)__D, (__mmask8)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_shrdv_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpshrdv_v4si_maskz((__v4si)__B, (__v4si)__C, + (__v4si)__D, (__mmask8)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpshrdv_v2di((__v2di)__A, (__v2di)__B, + (__v2di)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_shrdv_epi64(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpshrdv_v2di_mask((__v2di)__A, (__v2di)__C, + (__v2di)__D, (__mmask8)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_shrdv_epi64(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpshrdv_v2di_maskz((__v2di)__B, (__v2di)__C, + (__v2di)__D, (__mmask8)__A); +} + +extern 
__inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_vpshldv_v16hi((__v16hi)__A, (__v16hi)__B, + (__v16hi)__C); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_shldv_epi16(__m256i __A, __mmask16 __B, __m256i __C, __m256i __D) { + return (__m256i)__builtin_ia32_vpshldv_v16hi_mask( + (__v16hi)__A, (__v16hi)__C, (__v16hi)__D, (__mmask16)__B); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_maskz_shldv_epi16(__mmask16 __A, __m256i __B, __m256i __C, __m256i __D) { + return (__m256i)__builtin_ia32_vpshldv_v16hi_maskz( + (__v16hi)__B, (__v16hi)__C, (__v16hi)__D, (__mmask16)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_vpshldv_v8si((__v8si)__A, (__v8si)__B, + (__v8si)__C); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_shldv_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { + return (__m256i)__builtin_ia32_vpshldv_v8si_mask((__v8si)__A, (__v8si)__C, + (__v8si)__D, (__mmask8)__B); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_maskz_shldv_epi32(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) { + return (__m256i)__builtin_ia32_vpshldv_v8si_maskz((__v8si)__B, (__v8si)__C, + (__v8si)__D, (__mmask8)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_vpshldv_v4di((__v4di)__A, (__v4di)__B, + (__v4di)__C); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_shldv_epi64(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { + return (__m256i)__builtin_ia32_vpshldv_v4di_mask((__v4di)__A, (__v4di)__C, + (__v4di)__D, (__mmask8)__B); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_maskz_shldv_epi64(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) { + return (__m256i)__builtin_ia32_vpshldv_v4di_maskz((__v4di)__B, (__v4di)__C, + (__v4di)__D, (__mmask8)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpshldv_v8hi((__v8hi)__A, (__v8hi)__B, + (__v8hi)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_shldv_epi16(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpshldv_v8hi_mask((__v8hi)__A, (__v8hi)__C, + (__v8hi)__D, (__mmask8)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_shldv_epi16(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpshldv_v8hi_maskz((__v8hi)__B, (__v8hi)__C, + (__v8hi)__D, (__mmask8)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpshldv_v4si((__v4si)__A, (__v4si)__B, + (__v4si)__C); +} + 
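/* Illustration, not part of the imported header: the shldv/shrdv forms
   above take per-lane shift counts from a third vector rather than an
   immediate. _mm_shldv_epi16(a, b, c) keeps the high 16 bits of the 32-bit
   concatenation a:b shifted left by c modulo 16 in each lane, and the
   mask_/maskz_ variants merge into the first operand or zero the inactive
   lanes. A minimal sketch, assuming AVX512VBMI2 and AVX512VL:

     // hypothetical helper: per-lane variable rotate-left of 16-bit words
     static inline __m128i rotlv16 (__m128i __v, __m128i __counts) {
       return _mm_shldv_epi16 (__v, __v, __counts);
     }
*/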
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_mask_shldv_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+ return (__m128i)__builtin_ia32_vpshldv_v4si_mask((__v4si)__A, (__v4si)__C,
+ (__v4si)__D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_maskz_shldv_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) {
+ return (__m128i)__builtin_ia32_vpshldv_v4si_maskz((__v4si)__B, (__v4si)__C,
+ (__v4si)__D, (__mmask8)__A);
+}
+
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C) {
+ return (__m128i)__builtin_ia32_vpshldv_v2di((__v2di)__A, (__v2di)__B,
+ (__v2di)__C);
+}
+
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_mask_shldv_epi64(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+ return (__m128i)__builtin_ia32_vpshldv_v2di_mask((__v2di)__A, (__v2di)__C,
+ (__v2di)__D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_maskz_shldv_epi64(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) {
+ return (__m128i)__builtin_ia32_vpshldv_v2di_maskz((__v2di)__B, (__v2di)__C,
+ (__v2di)__D, (__mmask8)__A);
+}
+
+#ifdef __DISABLE_AVX512VBMI2VL__
+#undef __DISABLE_AVX512VBMI2VL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMI2VL__ */
+
+#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__) || \
+ !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi2,avx512vl,avx512bw")
+#define __DISABLE_AVX512VBMI2VLBW__
+#endif /* __AVX512VBMI2VLBW__ */
+
+extern __inline __m256i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_mask_compress_epi8(__m256i __A, __mmask32 __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_compressqi256_mask((__v32qi)__C, (__v32qi)__A,
+ (__mmask32)__B);
+}
+
+extern __inline __m256i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_maskz_compress_epi8(__mmask32 __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_compressqi256_mask(
+ (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__A);
+}
+
+extern __inline void
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_mask_compressstoreu_epi8(void *__A, __mmask32 __B, __m256i __C) {
+ __builtin_ia32_compressstoreuqi256_mask((__v32qi *)__A, (__v32qi)__C,
+ (__mmask32)__B);
+}
+
+extern __inline __m256i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_mask_expand_epi8(__m256i __A, __mmask32 __B, __m256i __C) {
+ return (__m256i)__builtin_ia32_expandqi256_mask((__v32qi)__C, (__v32qi)__A,
+ (__mmask32)__B);
+}
+
+extern __inline __m256i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_maskz_expand_epi8(__mmask32 __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_expandqi256_maskz(
+ (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__A);
+}
+
+extern __inline __m256i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_mask_expandloadu_epi8(__m256i __A, __mmask32 __B, const void *__C) {
+ return (__m256i)__builtin_ia32_expandloadqi256_mask(
+ (const __v32qi *)__C, (__v32qi)__A, (__mmask32)__B);
+}
+
+extern __inline __m256i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_maskz_expandloadu_epi8(__mmask32 __A, const void *__B) {
+ return (__m256i)__builtin_ia32_expandloadqi256_maskz(
+ (const __v32qi *)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__A);
+}
+
+#ifdef __DISABLE_AVX512VBMI2VLBW__
+#undef __DISABLE_AVX512VBMI2VLBW__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMI2VLBW__ */
+
+#endif /* _AVX512VBMI2VLINTRIN_H_INCLUDED */
diff --git a/third_party/intel/avx512vbmiintrin.internal.h b/third_party/intel/avx512vbmiintrin.internal.h
new file mode 100644
index 000000000..e0b4f1f71
--- /dev/null
+++ b/third_party/intel/avx512vbmiintrin.internal.h
@@ -0,0 +1,107 @@
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VBMIINTRIN_H_INCLUDED
+#define _AVX512VBMIINTRIN_H_INCLUDED
+
+#ifndef __AVX512VBMI__
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi")
+#define __DISABLE_AVX512VBMI__
+#endif /* __AVX512VBMI__ */
+
+extern __inline __m512i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X,
+ __m512i __Y) {
+ return (__m512i)__builtin_ia32_vpmultishiftqb512_mask(
+ (__v64qi)__X, (__v64qi)__Y, (__v64qi)__W, (__mmask64)__M);
+}
+
+extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
+ __artificial__))
+_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y) {
+ return (__m512i)__builtin_ia32_vpmultishiftqb512_mask(
+ (__v64qi)__X, (__v64qi)__Y, (__v64qi)_mm512_setzero_si512(),
+ (__mmask64)__M);
+}
+
+extern __inline __m512i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y) {
+ return (__m512i)__builtin_ia32_vpmultishiftqb512_mask(
+ (__v64qi)__X, (__v64qi)__Y, (__v64qi)_mm512_undefined_epi32(),
+ (__mmask64)-1);
+}
+
+extern __inline __m512i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm512_permutexvar_epi8(__m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_permvarqi512_mask(
+ (__v64qi)__B, (__v64qi)__A, (__v64qi)_mm512_undefined_epi32(),
+ (__mmask64)-1);
+}
+
+extern __inline __m512i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_permvarqi512_mask(
+ (__v64qi)__B, (__v64qi)__A, (__v64qi)_mm512_setzero_si512(),
+ (__mmask64)__M);
+}
+
+extern __inline __m512i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, __m512i __A,
+ __m512i __B) {
+ return (__m512i)__builtin_ia32_permvarqi512_mask(
+ (__v64qi)__B, (__v64qi)__A, (__v64qi)__W, (__mmask64)__M);
+}
+
+extern __inline __m512i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) {
+ return (__m512i)__builtin_ia32_vpermt2varqi512_mask(
+ (__v64qi)__I
+ /* idx */,
+ (__v64qi)__A, (__v64qi)__B, (__mmask64)-1);
+}
+
+extern __inline __m512i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
+ __m512i __B) {
+ return (__m512i)__builtin_ia32_vpermt2varqi512_mask(
+ (__v64qi)__I
+ /* idx */,
+ (__v64qi)__A, (__v64qi)__B, (__mmask64)__U);
+}
+
+extern __inline __m512i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
+ __m512i __B) {
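/* Illustration, not part of the imported header: VBMI's
   _mm512_permutexvar_epi8(idx, a) is a full 64-byte shuffle in which each
   output byte selects a[idx & 63], and the permutex2var forms index a
   128-byte table split across two registers, with bit 6 of each index
   byte choosing the second source. A minimal sketch, assuming AVX512VBMI:

     // hypothetical helper: gather bytes of data in the order given by idx
     static inline __m512i shuffle_bytes (__m512i __idx, __m512i __data) {
       return _mm512_permutexvar_epi8 (__idx, __data);
     }
*/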
+ return (__m512i)__builtin_ia32_vpermi2varqi512_mask((__v64qi)__A,
+ (__v64qi)__I
+ /* idx */,
+ (__v64qi)__B,
+ (__mmask64)__U);
+}
+
+extern __inline __m512i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
+ __m512i __B) {
+ return (__m512i)__builtin_ia32_vpermt2varqi512_maskz(
+ (__v64qi)__I
+ /* idx */,
+ (__v64qi)__A, (__v64qi)__B, (__mmask64)__U);
+}
+
+#ifdef __DISABLE_AVX512VBMI__
+#undef __DISABLE_AVX512VBMI__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMI__ */
+
+#endif /* _AVX512VBMIINTRIN_H_INCLUDED */
diff --git a/third_party/intel/avx512vbmivlintrin.internal.h b/third_party/intel/avx512vbmivlintrin.internal.h
new file mode 100644
index 000000000..53db4b9ab
--- /dev/null
+++ b/third_party/intel/avx512vbmivlintrin.internal.h
@@ -0,0 +1,194 @@
+#ifndef _IMMINTRIN_H_INCLUDED
+#error \
+ "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VBMIVLINTRIN_H_INCLUDED
+#define _AVX512VBMIVLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512VBMI__)
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi,avx512vl")
+#define __DISABLE_AVX512VBMIVL__
+#endif /* __AVX512VBMIVL__ */
+
+extern __inline __m256i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X,
+ __m256i __Y) {
+ return (__m256i)__builtin_ia32_vpmultishiftqb256_mask(
+ (__v32qi)__X, (__v32qi)__Y, (__v32qi)__W, (__mmask32)__M);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
+ __artificial__))
+_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y) {
+ return (__m256i)__builtin_ia32_vpmultishiftqb256_mask(
+ (__v32qi)__X, (__v32qi)__Y, (__v32qi)_mm256_setzero_si256(),
+ (__mmask32)__M);
+}
+
+extern __inline __m256i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y) {
+ return (__m256i)__builtin_ia32_vpmultishiftqb256_mask(
+ (__v32qi)__X, (__v32qi)__Y, (__v32qi)_mm256_undefined_si256(),
+ (__mmask32)-1);
+}
+
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X,
+ __m128i __Y) {
+ return (__m128i)__builtin_ia32_vpmultishiftqb128_mask(
+ (__v16qi)__X, (__v16qi)__Y, (__v16qi)__W, (__mmask16)__M);
+}
+
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y) {
+ return (__m128i)__builtin_ia32_vpmultishiftqb128_mask(
+ (__v16qi)__X, (__v16qi)__Y, (__v16qi)_mm_setzero_si128(), (__mmask16)__M);
+}
+
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_multishift_epi64_epi8(__m128i __X, __m128i __Y) {
+ return (__m128i)__builtin_ia32_vpmultishiftqb128_mask(
+ (__v16qi)__X, (__v16qi)__Y, (__v16qi)_mm_undefined_si128(),
+ (__mmask16)-1);
+}
+
+extern __inline __m256i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_permutexvar_epi8(__m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_permvarqi256_mask(
+ (__v32qi)__B, (__v32qi)__A, (__v32qi)_mm256_undefined_si256(),
+ (__mmask32)-1);
+}
+
+extern __inline __m256i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
+ return
(__m256i)__builtin_ia32_permvarqi256_mask( + (__v32qi)__B, (__v32qi)__A, (__v32qi)_mm256_setzero_si256(), + (__mmask32)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) { + return (__m256i)__builtin_ia32_permvarqi256_mask( + (__v32qi)__B, (__v32qi)__A, (__v32qi)__W, (__mmask32)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_permutexvar_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_permvarqi128_mask( + (__v16qi)__B, (__v16qi)__A, (__v16qi)_mm_undefined_si128(), + (__mmask16)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_permvarqi128_mask( + (__v16qi)__B, (__v16qi)__A, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) { + return (__m128i)__builtin_ia32_permvarqi128_mask( + (__v16qi)__B, (__v16qi)__A, (__v16qi)__W, (__mmask16)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) { + return (__m256i)__builtin_ia32_vpermt2varqi256_mask( + (__v32qi)__I + /* idx */, + (__v32qi)__A, (__v32qi)__B, (__mmask32)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I, + __m256i __B) { + return (__m256i)__builtin_ia32_vpermt2varqi256_mask( + (__v32qi)__I + /* idx */, + (__v32qi)__A, (__v32qi)__B, (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U, + __m256i __B) { + return (__m256i)__builtin_ia32_vpermi2varqi256_mask((__v32qi)__A, + (__v32qi)__I + /* idx */, + (__v32qi)__B, + (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I, + __m256i __B) { + return (__m256i)__builtin_ia32_vpermt2varqi256_maskz( + (__v32qi)__I + /* idx */, + (__v32qi)__A, (__v32qi)__B, (__mmask32)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) { + return (__m128i)__builtin_ia32_vpermt2varqi128_mask( + (__v16qi)__I + /* idx */, + (__v16qi)__A, (__v16qi)__B, (__mmask16)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I, + __m128i __B) { + return (__m128i)__builtin_ia32_vpermt2varqi128_mask( + (__v16qi)__I + /* idx */, + (__v16qi)__A, (__v16qi)__B, (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U, + __m128i __B) { + return (__m128i)__builtin_ia32_vpermi2varqi128_mask((__v16qi)__A, + (__v16qi)__I + /* idx */, + (__v16qi)__B, + (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__))
+ _mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpermt2varqi128_maskz(
+ (__v16qi)__I
+ /* idx */,
+ (__v16qi)__A, (__v16qi)__B, (__mmask16)__U);
+}
+
+#ifdef __DISABLE_AVX512VBMIVL__
+#undef __DISABLE_AVX512VBMIVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMIVL__ */
+
+#endif /* _AVX512VBMIVLINTRIN_H_INCLUDED */
diff --git a/third_party/intel/avx512vlbwintrin.internal.h b/third_party/intel/avx512vlbwintrin.internal.h
new file mode 100644
index 000000000..836367405
--- /dev/null
+++ b/third_party/intel/avx512vlbwintrin.internal.h
@@ -0,0 +1,3388 @@
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vlbwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VLBWINTRIN_H_INCLUDED
+#define _AVX512VLBWINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("avx512vl,avx512bw")
+#define __DISABLE_AVX512VLBW__
+#endif /* __AVX512VLBW__ */
+
+extern __inline __m256i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_mask_mov_epi8(__m256i __W, __mmask32 __U, __m256i __A) {
+ return (__m256i)__builtin_ia32_movdquqi256_mask((__v32qi)__A, (__v32qi)__W,
+ (__mmask32)__U);
+}
+
+extern __inline __m256i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_maskz_mov_epi8(__mmask32 __U, __m256i __A) {
+ return (__m256i)__builtin_ia32_movdquqi256_mask(
+ (__v32qi)__A, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U);
+}
+
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_mask_mov_epi8(__m128i __W, __mmask16 __U, __m128i __A) {
+ return (__m128i)__builtin_ia32_movdquqi128_mask((__v16qi)__A, (__v16qi)__W,
+ (__mmask16)__U);
+}
+
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_maskz_mov_epi8(__mmask16 __U, __m128i __A) {
+ return (__m128i)__builtin_ia32_movdquqi128_mask(
+ (__v16qi)__A, (__v16qi)_mm_setzero_si128(), (__mmask16)__U);
+}
+
+extern __inline void
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_mask_storeu_epi8(void *__P, __mmask32 __U, __m256i __A) {
+ __builtin_ia32_storedquqi256_mask((char *)__P, (__v32qi)__A, (__mmask32)__U);
+}
+
+extern __inline void
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_mask_storeu_epi8(void *__P, __mmask16 __U, __m128i __A) {
+ __builtin_ia32_storedquqi128_mask((char *)__P, (__v16qi)__A, (__mmask16)__U);
+}
+
+extern __inline __m256i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_mask_loadu_epi16(__m256i __W, __mmask16 __U, void const *__P) {
+ return (__m256i)__builtin_ia32_loaddquhi256_mask(
+ (const short *)__P, (__v16hi)__W, (__mmask16)__U);
+}
+
+extern __inline __m256i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm256_maskz_loadu_epi16(__mmask16 __U, void const *__P) {
+ return (__m256i)__builtin_ia32_loaddquhi256_mask(
+ (const short *)__P, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
+}
+
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_mask_loadu_epi16(__m128i __W, __mmask8 __U, void const *__P) {
+ return (__m128i)__builtin_ia32_loaddquhi128_mask((const short *)__P,
+ (__v8hi)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_maskz_loadu_epi16(__mmask8 __U, void const *__P) {
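/* Illustration, not part of the imported header: these masked loadu and
   storeu intrinsics only touch memory for elements whose mask bit is set,
   which is the standard idiom for handling a buffer tail without reading
   past its end. A minimal sketch, assuming AVX512BW and AVX512VL:

     // hypothetical helper: load n (0..16) bytes, zeroing the rest
     static inline __m128i load_tail (const void *__p, unsigned __n) {
       __mmask16 __k = (__mmask16)((1u << __n) - 1);
       return _mm_maskz_loadu_epi8 (__k, __p);
     }
*/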
return (__m128i)__builtin_ia32_loaddquhi128_mask( + (const short *)__P, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_mov_epi16(__m256i __W, __mmask16 __U, __m256i __A) { + return (__m256i)__builtin_ia32_movdquhi256_mask((__v16hi)__A, (__v16hi)__W, + (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_mov_epi16(__mmask16 __U, __m256i __A) { + return (__m256i)__builtin_ia32_movdquhi256_mask( + (__v16hi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mov_epi16(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_movdquhi128_mask((__v8hi)__A, (__v8hi)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_mov_epi16(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_movdquhi128_mask( + (__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_loadu_epi8(__m256i __W, __mmask32 __U, void const *__P) { + return (__m256i)__builtin_ia32_loaddquqi256_mask( + (const char *)__P, (__v32qi)__W, (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_loadu_epi8(__mmask32 __U, void const *__P) { + return (__m256i)__builtin_ia32_loaddquqi256_mask( + (const char *)__P, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_loadu_epi8(__m128i __W, __mmask16 __U, void const *__P) { + return (__m128i)__builtin_ia32_loaddquqi128_mask( + (const char *)__P, (__v16qi)__W, (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_loadu_epi8(__mmask16 __U, void const *__P) { + return (__m128i)__builtin_ia32_loaddquqi128_mask( + (const char *)__P, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi16_epi8(__m256i __A) { + + return (__m128i)__builtin_ia32_pmovwb256_mask( + (__v16hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi16_storeu_epi8(void *__P, __mmask16 __M, __m256i __A) { + __builtin_ia32_pmovwb256mem_mask((__v16qi *)__P, (__v16hi)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi16_epi8(__m128i __O, __mmask16 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovwb256_mask((__v16hi)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi16_epi8(__mmask16 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovwb256_mask( + (__v16hi)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsepi16_epi8(__m128i __A) { + + return (__m128i)__builtin_ia32_pmovswb128_mask( + (__v8hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline 
void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtsepi16_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovswb128mem_mask((__v8qi *)__P, (__v8hi)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtsepi16_epi8(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovswb128_mask((__v8hi)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtsepi16_epi8(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovswb128_mask( + (__v8hi)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtsepi16_epi8(__m256i __A) { + + return (__m128i)__builtin_ia32_pmovswb256_mask( + (__v16hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtsepi16_storeu_epi8(void *__P, __mmask16 __M, __m256i __A) { + __builtin_ia32_pmovswb256mem_mask((__v16qi *)__P, (__v16hi)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtsepi16_epi8(__m128i __O, __mmask16 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovswb256_mask((__v16hi)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtsepi16_epi8(__mmask16 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovswb256_mask( + (__v16hi)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtusepi16_epi8(__m128i __A) { + + return (__m128i)__builtin_ia32_pmovuswb128_mask( + (__v8hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtusepi16_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovuswb128mem_mask((__v8qi *)__P, (__v8hi)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtusepi16_epi8(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovuswb128_mask((__v8hi)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtusepi16_epi8(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovuswb128_mask( + (__v8hi)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtusepi16_epi8(__m256i __A) { + + return (__m128i)__builtin_ia32_pmovuswb256_mask( + (__v16hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtusepi16_storeu_epi8(void *__P, __mmask16 __M, __m256i __A) { + __builtin_ia32_pmovuswb256mem_mask((__v16qi *)__P, (__v16hi)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtusepi16_epi8(__m128i __O, __mmask16 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovuswb256_mask((__v16hi)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtusepi16_epi8(__mmask16 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovuswb256_mask( + (__v16hi)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_broadcastb_epi8(__m256i __O, __mmask32 __M, __m128i __A) { + return (__m256i)__builtin_ia32_pbroadcastb256_mask((__v16qi)__A, (__v32qi)__O, + __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_broadcastb_epi8(__mmask32 __M, __m128i __A) { + return (__m256i)__builtin_ia32_pbroadcastb256_mask( + (__v16qi)__A, (__v32qi)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) { + return (__m256i)__builtin_ia32_pbroadcastb256_gpr_mask(__A, (__v32qi)__O, + __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_set1_epi8(__mmask32 __M, char __A) { + return (__m256i)__builtin_ia32_pbroadcastb256_gpr_mask( + __A, (__v32qi)_mm256_setzero_si256(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_broadcastb_epi8(__m128i __O, __mmask16 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pbroadcastb128_mask((__v16qi)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_broadcastb_epi8(__mmask16 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pbroadcastb128_mask( + (__v16qi)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_set1_epi8(__m128i __O, __mmask16 __M, char __A) { + return (__m128i)__builtin_ia32_pbroadcastb128_gpr_mask(__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_set1_epi8(__mmask16 __M, char __A) { + return (__m128i)__builtin_ia32_pbroadcastb128_gpr_mask( + __A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_broadcastw_epi16(__m256i __O, __mmask16 __M, __m128i __A) { + return (__m256i)__builtin_ia32_pbroadcastw256_mask((__v8hi)__A, (__v16hi)__O, + __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_broadcastw_epi16(__mmask16 __M, __m128i __A) { + return (__m256i)__builtin_ia32_pbroadcastw256_mask( + (__v8hi)__A, (__v16hi)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) { + return (__m256i)__builtin_ia32_pbroadcastw256_gpr_mask(__A, (__v16hi)__O, + __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_set1_epi16(__mmask16 __M, short __A) { + return (__m256i)__builtin_ia32_pbroadcastw256_gpr_mask( + __A, (__v16hi)_mm256_setzero_si256(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_broadcastw_epi16(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pbroadcastw128_mask((__v8hi)__A, 
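/* Illustration, not part of the imported header: the mask_set1 and
   mask_broadcast forms splat a single element, but only into the lanes
   selected by the mask; mask_ keeps the remaining lanes from the first
   operand and maskz_ zeroes them. A minimal sketch, assuming AVX512BW
   and AVX512VL:

     // hypothetical helper: overwrite the even-indexed bytes of v
     static inline __m128i fill_even (__m128i __v, char __fill) {
       return _mm_mask_set1_epi8 (__v, (__mmask16)0x5555, __fill);
     }
*/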
(__v8hi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_broadcastw_epi16(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pbroadcastw128_mask( + (__v8hi)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) { + return (__m128i)__builtin_ia32_pbroadcastw128_gpr_mask(__A, (__v8hi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_set1_epi16(__mmask8 __M, short __A) { + return (__m128i)__builtin_ia32_pbroadcastw128_gpr_mask( + __A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permutexvar_epi16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_permvarhi256_mask( + (__v16hi)__B, (__v16hi)__A, (__v16hi)_mm256_setzero_si256(), + (__mmask16)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_permutexvar_epi16(__mmask16 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_permvarhi256_mask( + (__v16hi)__B, (__v16hi)__A, (__v16hi)_mm256_setzero_si256(), + (__mmask16)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_permutexvar_epi16(__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) { + return (__m256i)__builtin_ia32_permvarhi256_mask( + (__v16hi)__B, (__v16hi)__A, (__v16hi)__W, (__mmask16)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_permutexvar_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_permvarhi128_mask( + (__v8hi)__B, (__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_permutexvar_epi16(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_permvarhi128_mask( + (__v8hi)__B, (__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_permutexvar_epi16(__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) { + return (__m128i)__builtin_ia32_permvarhi128_mask((__v8hi)__B, (__v8hi)__A, + (__v8hi)__W, (__mmask8)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) { + return (__m256i)__builtin_ia32_vpermt2varhi256_mask( + (__v16hi)__I + /* idx */, + (__v16hi)__A, (__v16hi)__B, (__mmask16)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I, + __m256i __B) { + return (__m256i)__builtin_ia32_vpermt2varhi256_mask( + (__v16hi)__I + /* idx */, + (__v16hi)__A, (__v16hi)__B, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U, + __m256i __B) { + return (__m256i)__builtin_ia32_vpermi2varhi256_mask((__v16hi)__A, + (__v16hi)__I + /* idx */, + (__v16hi)__B, + (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i __I, + __m256i __B) { + return (__m256i)__builtin_ia32_vpermt2varhi256_maskz( + (__v16hi)__I + /* idx */, + (__v16hi)__A, (__v16hi)__B, (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) { + return (__m128i)__builtin_ia32_vpermt2varhi128_mask((__v8hi)__I + /* idx */, + (__v8hi)__A, (__v8hi)__B, + (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I, + __m128i __B) { + return (__m128i)__builtin_ia32_vpermt2varhi128_mask((__v8hi)__I + /* idx */, + (__v8hi)__A, (__v8hi)__B, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, + __m128i __B) { + return (__m128i)__builtin_ia32_vpermi2varhi128_mask((__v8hi)__A, + (__v8hi)__I + /* idx */, + (__v8hi)__B, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_permutex2var_epi16(__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) { + return (__m128i)__builtin_ia32_vpermt2varhi128_maskz((__v8hi)__I + /* idx */, + (__v8hi)__A, (__v8hi)__B, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, + __m256i __Y) { + return (__m256i)__builtin_ia32_pmaddubsw256_mask( + (__v32qi)__X, (__v32qi)__Y, (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_pmaddubsw256_mask( + (__v32qi)__X, (__v32qi)__Y, (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pmaddubsw128_mask((__v16qi)__X, (__v16qi)__Y, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pmaddubsw128_mask( + (__v16qi)__X, (__v16qi)__Y, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaddwd256_mask((__v16hi)__A, (__v16hi)__B, + (__v8si)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaddwd256_mask((__v16hi)__A, (__v16hi)__B, + (__v8si)_mm256_setzero_si256(), + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmaddwd128_mask((__v8hi)__A, (__v8hi)__B, + (__v4si)__W, (__mmask8)__U); +} + +extern 
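/* Illustration, not part of the imported header: permutex2var_epi16
   treats its first and third operands as one concatenated table; in the
   128-bit form, index bits [2:0] select the word and bit 3 selects the
   source register, so a two-input interleave is a single instruction.
   A minimal sketch, assuming AVX512BW and AVX512VL:

     // hypothetical helper: interleave the low four words of a and b
     static inline __m128i zip_lo16 (__m128i __a, __m128i __b) {
       const __m128i __idx = _mm_set_epi16 (11, 3, 10, 2, 9, 1, 8, 0);
       return _mm_permutex2var_epi16 (__a, __idx, __b);
     }
*/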
__inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmaddwd128_mask( + (__v8hi)__A, (__v8hi)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movepi8_mask(__m128i __A) { + return (__mmask16)__builtin_ia32_cvtb2mask128((__v16qi)__A); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_movepi8_mask(__m256i __A) { + return (__mmask32)__builtin_ia32_cvtb2mask256((__v32qi)__A); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movepi16_mask(__m128i __A) { + return (__mmask8)__builtin_ia32_cvtw2mask128((__v8hi)__A); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_movepi16_mask(__m256i __A) { + return (__mmask16)__builtin_ia32_cvtw2mask256((__v16hi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movm_epi8(__mmask16 __A) { + return (__m128i)__builtin_ia32_cvtmask2b128(__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_movm_epi8(__mmask32 __A) { + return (__m256i)__builtin_ia32_cvtmask2b256(__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movm_epi16(__mmask8 __A) { + return (__m128i)__builtin_ia32_cvtmask2w128(__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_movm_epi16(__mmask16 __A) { + return (__m256i)__builtin_ia32_cvtmask2w256(__A); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_test_epi8_mask(__m128i __A, __m128i __B) { + return (__mmask16)__builtin_ia32_ptestmb128((__v16qi)__A, (__v16qi)__B, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { + return (__mmask16)__builtin_ia32_ptestmb128((__v16qi)__A, (__v16qi)__B, __U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_test_epi8_mask(__m256i __A, __m256i __B) { + return (__mmask32)__builtin_ia32_ptestmb256((__v32qi)__A, (__v32qi)__B, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_test_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) { + return (__mmask32)__builtin_ia32_ptestmb256((__v32qi)__A, (__v32qi)__B, __U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_test_epi16_mask(__m128i __A, __m128i __B) { + return (__mmask8)__builtin_ia32_ptestmw128((__v8hi)__A, (__v8hi)__B, + (__mmask8)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_test_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) { + return (__mmask8)__builtin_ia32_ptestmw128((__v8hi)__A, (__v8hi)__B, __U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_test_epi16_mask(__m256i __A, __m256i __B) { + return (__mmask16)__builtin_ia32_ptestmw256((__v16hi)__A, (__v16hi)__B, + (__mmask16)-1); +} + +extern 
__inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_test_epi16_mask(__mmask16 __U, __m256i __A, __m256i __B) { + return (__mmask16)__builtin_ia32_ptestmw256((__v16hi)__A, (__v16hi)__B, __U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminuw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__M); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminuw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pminuw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pminuw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminsw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__M); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminsw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_max_epu8(__mmask32 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxub256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)_mm256_setzero_si256(), + (__mmask32)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxub256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)__W, (__mmask32)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmaxub128_mask( + (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmaxub128_mask((__v16qi)__A, (__v16qi)__B, + (__v16qi)__W, (__mmask16)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxsb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)_mm256_setzero_si256(), + (__mmask32)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
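/* Illustration, not part of the imported header: these masked min/max
   forms compute the element-wise extremum and then apply AVX-512
   write-masking, so a clamp can be restricted to chosen lanes while the
   rest pass through untouched. A minimal sketch, assuming AVX512BW and
   AVX512VL:

     // hypothetical helper: clamp v to [lo, hi] only in the lanes of k
     static inline __m256i clamp_lanes (__m256i __v, __mmask16 __k,
                                        __m256i __lo, __m256i __hi) {
       __m256i __t = _mm256_mask_max_epi16 (__v, __k, __v, __lo);
       return _mm256_mask_min_epi16 (__v, __k, __t, __hi);
     }
*/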
_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxsb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)__W, (__mmask32)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmaxsb128_mask( + (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmaxsb128_mask((__v16qi)__A, (__v16qi)__B, + (__v16qi)__W, (__mmask16)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_min_epu8(__mmask32 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminub256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)_mm256_setzero_si256(), + (__mmask32)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminub256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)__W, (__mmask32)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pminub128_mask( + (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pminub128_mask((__v16qi)__A, (__v16qi)__B, + (__v16qi)__W, (__mmask16)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminsb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)_mm256_setzero_si256(), + (__mmask32)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminsb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)__W, (__mmask32)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pminsb128_mask( + (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pminsb128_mask((__v16qi)__A, (__v16qi)__B, + (__v16qi)__W, (__mmask16)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxsw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__M); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { + return 
(__m256i)__builtin_ia32_pmaxsw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmaxsw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmaxsw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxuw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__M); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxuw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmaxuw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmaxuw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pminsw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pminsw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__M); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_alignr_epi8(__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B, const int __N) { + return (__m256i)__builtin_ia32_palignr256_mask( + (__v4di)__A, (__v4di)__B, __N * 8, (__v4di)__W, (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_alignr_epi8(__mmask32 __U, __m256i __A, __m256i __B, + const int __N) { + return (__m256i)__builtin_ia32_palignr256_mask( + (__v4di)__A, (__v4di)__B, __N * 8, (__v4di)_mm256_setzero_si256(), + (__mmask32)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_alignr_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B, + const int __N) { + return (__m128i)__builtin_ia32_palignr128_mask( + (__v2di)__A, (__v2di)__B, __N * 8, (__v2di)__W, (__mmask16)__U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_alignr_epi8(__mmask16 __U, __m128i __A, __m128i __B, const int __N) { + return 
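/* Illustration, not part of the imported header: alignr concatenates each
   pair of 128-bit lanes and extracts a byte-shifted 16-byte window; these
   AVX-512 variants add a per-byte writemask, and the underlying builtin
   takes its offset in bits, which is why the wrappers pass __N * 8. A
   minimal sketch, assuming AVX512BW and AVX512VL:

     // hypothetical helper: bytes 5..20 of the 32-byte concat a:b,
     // zeroing result bytes whose bit in k is clear
     static inline __m128i window5 (__mmask16 __k, __m128i __a, __m128i __b) {
       return _mm_maskz_alignr_epi8 (__k, __a, __b, 5);
     }
*/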
(__m128i)__builtin_ia32_palignr128_mask( + (__v2di)__A, (__v2di)__B, __N * 8, (__v2di)_mm_setzero_si128(), + (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_dbsad_epu8(__m256i __A, __m256i __B, const int __imm) { + return (__m256i)__builtin_ia32_dbpsadbw256_mask( + (__v32qi)__A, (__v32qi)__B, __imm, (__v16hi)_mm256_setzero_si256(), + (__mmask16)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_dbsad_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B, + const int __imm) { + return (__m256i)__builtin_ia32_dbpsadbw256_mask( + (__v32qi)__A, (__v32qi)__B, __imm, (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_dbsad_epu8(__mmask16 __U, __m256i __A, __m256i __B, + const int __imm) { + return (__m256i)__builtin_ia32_dbpsadbw256_mask( + (__v32qi)__A, (__v32qi)__B, __imm, (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_dbsad_epu8(__m128i __A, __m128i __B, const int __imm) { + return (__m128i)__builtin_ia32_dbpsadbw128_mask( + (__v16qi)__A, (__v16qi)__B, __imm, (__v8hi)_mm_setzero_si128(), + (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_dbsad_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B, + const int __imm) { + return (__m128i)__builtin_ia32_dbpsadbw128_mask( + (__v16qi)__A, (__v16qi)__B, __imm, (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B, const int __imm) { + return (__m128i)__builtin_ia32_dbpsadbw128_mask( + (__v16qi)__A, (__v16qi)__B, __imm, (__v8hi)_mm_setzero_si128(), + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_blend_epi16(__mmask8 __U, __m128i __A, __m128i __W) { + return (__m128i)__builtin_ia32_blendmw_128_mask((__v8hi)__A, (__v8hi)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_blend_epi8(__mmask16 __U, __m128i __A, __m128i __W) { + return (__m128i)__builtin_ia32_blendmb_128_mask((__v16qi)__A, (__v16qi)__W, + (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_blend_epi16(__mmask16 __U, __m256i __A, __m256i __W) { + return (__m256i)__builtin_ia32_blendmw_256_mask((__v16hi)__A, (__v16hi)__W, + (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_blend_epi8(__mmask32 __U, __m256i __A, __m256i __W) { + return (__m256i)__builtin_ia32_blendmb_256_mask((__v32qi)__A, (__v32qi)__W, + (__mmask32)__U); +} + +extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_cmp_epi16_mask(__mmask8 __U, __m128i __X, __m128i __Y, const int __P) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, __P, + (__mmask8)__U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmp_epi16_mask(__m128i __X, __m128i __Y, const int __P) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, 
(__v8hi)__Y, __P, + (__mmask8)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmp_epi16_mask(__mmask16 __U, __m256i __X, __m256i __Y, + const int __P) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, __P, + (__mmask16)__U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmp_epi16_mask(__m256i __X, __m256i __Y, const int __P) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, __P, + (__mmask16)-1); +} + +extern __inline __mmask16 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_cmp_epi8_mask(__mmask16 __U, __m128i __X, __m128i __Y, const int __P) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, __P, + (__mmask16)__U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmp_epi8_mask(__m128i __X, __m128i __Y, const int __P) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, __P, + (__mmask16)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmp_epi8_mask(__mmask32 __U, __m256i __X, __m256i __Y, + const int __P) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, __P, + (__mmask32)__U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmp_epi8_mask(__m256i __X, __m256i __Y, const int __P) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, __P, + (__mmask32)-1); +} + +extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_cmp_epu16_mask(__mmask8 __U, __m128i __X, __m128i __Y, const int __P) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, __P, + (__mmask8)__U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmp_epu16_mask(__m128i __X, __m128i __Y, const int __P) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, __P, + (__mmask8)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmp_epu16_mask(__mmask16 __U, __m256i __X, __m256i __Y, + const int __P) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, + __P, (__mmask16)__U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmp_epu16_mask(__m256i __X, __m256i __Y, const int __P) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, + __P, (__mmask16)-1); +} + +extern __inline __mmask16 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_cmp_epu8_mask(__mmask16 __U, __m128i __X, __m128i __Y, const int __P) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, + __P, (__mmask16)__U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmp_epu8_mask(__m128i __X, __m128i __Y, const int __P) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, + __P, (__mmask16)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmp_epu8_mask(__mmask32 __U, __m256i __X, __m256i __Y, + const int __P) { + return 
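/* Illustration, not part of the imported header: unlike legacy pcmpeq and
   pcmpgt, the cmp_*_mask family takes its comparison predicate as an
   immediate (the _MM_CMPINT_* constants from the AVX-512F header, e.g.
   _MM_CMPINT_EQ, _MM_CMPINT_LT, _MM_CMPINT_LE, _MM_CMPINT_NE) and returns
   a compact bitmask rather than a vector. A minimal sketch, assuming
   AVX512BW and AVX512VL:

     // hypothetical helper: mask of bytes where x <= y, unsigned
     static inline __mmask32 bytes_le (__m256i __x, __m256i __y) {
       return _mm256_cmp_epu8_mask (__x, __y, _MM_CMPINT_LE);
     }
*/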
(__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, + __P, (__mmask32)__U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmp_epu8_mask(__m256i __X, __m256i __Y, const int __P) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, + __P, (__mmask32)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, + const int __imm) { + return (__m256i)__builtin_ia32_psrlwi256_mask((__v16hi)__A, __imm, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, const int __imm) { + return (__m256i)__builtin_ia32_psrlwi256_mask( + (__v16hi)__A, __imm, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, const int __imm) { + return (__m128i)__builtin_ia32_psrlwi128_mask((__v8hi)__A, __imm, (__v8hi)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_srli_epi16(__mmask8 __U, __m128i __A, const int __imm) { + return (__m128i)__builtin_ia32_psrlwi128_mask( + (__v8hi)__A, __imm, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_shufflehi_epi16(__m256i __W, __mmask16 __U, __m256i __A, + const int __imm) { + return (__m256i)__builtin_ia32_pshufhw256_mask((__v16hi)__A, __imm, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_shufflehi_epi16(__mmask16 __U, __m256i __A, const int __imm) { + return (__m256i)__builtin_ia32_pshufhw256_mask( + (__v16hi)__A, __imm, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_shufflehi_epi16(__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) { + return (__m128i)__builtin_ia32_pshufhw128_mask((__v8hi)__A, __imm, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_shufflehi_epi16(__mmask8 __U, __m128i __A, const int __imm) { + return (__m128i)__builtin_ia32_pshufhw128_mask( + (__v8hi)__A, __imm, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_shufflelo_epi16(__m256i __W, __mmask16 __U, __m256i __A, + const int __imm) { + return (__m256i)__builtin_ia32_pshuflw256_mask((__v16hi)__A, __imm, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_shufflelo_epi16(__mmask16 __U, __m256i __A, const int __imm) { + return (__m256i)__builtin_ia32_pshuflw256_mask( + (__v16hi)__A, __imm, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_shufflelo_epi16(__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) { + return (__m128i)__builtin_ia32_pshuflw128_mask((__v8hi)__A, __imm, + (__v8hi)__W, (__mmask8)__U); 
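/*
 * Editor's aside (illustrative sketch, not part of the patch): each shift
 * and shuffle above comes in a merge-masking form (inactive lanes taken
 * from __W) and a zero-masking form (inactive lanes cleared). Assuming
 * -mavx512vl -mavx512bw and <immintrin.h>:
 *
 *   #include <assert.h>
 *   #include <immintrin.h>
 *   int main(void) {
 *     __m128i a = _mm_set1_epi16(0x80), w = _mm_set1_epi16(-1);
 *     short m[8], z[8];
 *     _mm_storeu_si128((__m128i *)m, _mm_mask_srli_epi16(w, 0x0F, a, 4));
 *     _mm_storeu_si128((__m128i *)z, _mm_maskz_srli_epi16(0x0F, a, 4));
 *     assert(m[0] == 8 && z[0] == 8);   // active lanes: 0x80 >> 4
 *     assert(m[7] == -1 && z[7] == 0);  // inactive: merged vs. zeroed
 *     return 0;
 *   }
 */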
+} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_shufflelo_epi16(__mmask8 __U, __m128i __A, const int __imm) { + return (__m128i)__builtin_ia32_pshuflw128_mask( + (__v8hi)__A, __imm, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, + const int __imm) { + return (__m256i)__builtin_ia32_psrawi256_mask((__v16hi)__A, __imm, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, const int __imm) { + return (__m256i)__builtin_ia32_psrawi256_mask( + (__v16hi)__A, __imm, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, const int __imm) { + return (__m128i)__builtin_ia32_psrawi128_mask((__v8hi)__A, __imm, (__v8hi)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, const int __imm) { + return (__m128i)__builtin_ia32_psrawi128_mask( + (__v8hi)__A, __imm, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B) { + return (__m256i)__builtin_ia32_psllwi256_mask((__v16hi)__A, __B, (__v16hi)__W, + (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, int __B) { + return (__m256i)__builtin_ia32_psllwi256_mask( + (__v16hi)__A, __B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) { + return (__m128i)__builtin_ia32_psllwi128_mask((__v8hi)__A, __B, (__v8hi)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_slli_epi16(__mmask8 __U, __m128i __A, int __B) { + return (__m128i)__builtin_ia32_psllwi128_mask( + (__v8hi)__A, __B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +#else +#define _mm256_mask_alignr_epi8(W, U, X, Y, N) \ + ((__m256i)__builtin_ia32_palignr256_mask( \ + (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)((N) * 8), \ + (__v4di)(__m256i)(W), (__mmask32)(U))) + +#define _mm256_mask_srli_epi16(W, U, A, B) \ + ((__m256i)__builtin_ia32_psrlwi256_mask( \ + (__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U))) + +#define _mm256_maskz_srli_epi16(U, A, B) \ + ((__m256i)__builtin_ia32_psrlwi256_mask((__v16hi)(__m256i)(A), (int)(B), \ + (__v16hi)_mm256_setzero_si256(), \ + (__mmask16)(U))) + +#define _mm_mask_srli_epi16(W, U, A, B) \ + ((__m128i)__builtin_ia32_psrlwi128_mask( \ + (__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_srli_epi16(U, A, B) \ + ((__m128i)__builtin_ia32_psrlwi128_mask((__v8hi)(__m128i)(A), (int)(B), \ + (__v8hi)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_mask_srai_epi16(W, U, A, B) \ + ((__m256i)__builtin_ia32_psrawi256_mask( \ + (__v16hi)(__m256i)(A), (int)(B),
(__v16hi)(__m256i)(W), (__mmask16)(U))) + +#define _mm256_maskz_srai_epi16(U, A, B) \ + ((__m256i)__builtin_ia32_psrawi256_mask((__v16hi)(__m256i)(A), (int)(B), \ + (__v16hi)_mm256_setzero_si256(), \ + (__mmask16)(U))) + +#define _mm_mask_srai_epi16(W, U, A, B) \ + ((__m128i)__builtin_ia32_psrawi128_mask( \ + (__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_srai_epi16(U, A, B) \ + ((__m128i)__builtin_ia32_psrawi128_mask((__v8hi)(__m128i)(A), (int)(B), \ + (__v8hi)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_mask_shufflehi_epi16(W, U, A, B) \ + ((__m256i)__builtin_ia32_pshufhw256_mask( \ + (__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U))) + +#define _mm256_maskz_shufflehi_epi16(U, A, B) \ + ((__m256i)__builtin_ia32_pshufhw256_mask( \ + (__v16hi)(__m256i)(A), (int)(B), \ + (__v16hi)(__m256i)_mm256_setzero_si256(), (__mmask16)(U))) + +#define _mm_mask_shufflehi_epi16(W, U, A, B) \ + ((__m128i)__builtin_ia32_pshufhw128_mask( \ + (__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_shufflehi_epi16(U, A, B) \ + ((__m128i)__builtin_ia32_pshufhw128_mask( \ + (__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_mask_shufflelo_epi16(W, U, A, B) \ + ((__m256i)__builtin_ia32_pshuflw256_mask( \ + (__v16hi)(__m256i)(A), (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U))) + +#define _mm256_maskz_shufflelo_epi16(U, A, B) \ + ((__m256i)__builtin_ia32_pshuflw256_mask( \ + (__v16hi)(__m256i)(A), (int)(B), \ + (__v16hi)(__m256i)_mm256_setzero_si256(), (__mmask16)(U))) + +#define _mm_mask_shufflelo_epi16(W, U, A, B) \ + ((__m128i)__builtin_ia32_pshuflw128_mask( \ + (__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_shufflelo_epi16(U, A, B) \ + ((__m128i)__builtin_ia32_pshuflw128_mask( \ + (__v8hi)(__m128i)(A), (int)(B), (__v8hi)(__m128i)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_maskz_alignr_epi8(U, X, Y, N) \ + ((__m256i)__builtin_ia32_palignr256_mask( \ + (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)((N) * 8), \ + (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask32)(U))) + +#define _mm_mask_alignr_epi8(W, U, X, Y, N) \ + ((__m128i)__builtin_ia32_palignr128_mask( \ + (__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)((N) * 8), \ + (__v2di)(__m128i)(W), (__mmask16)(U))) + +#define _mm_maskz_alignr_epi8(U, X, Y, N) \ + ((__m128i)__builtin_ia32_palignr128_mask( \ + (__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)((N) * 8), \ + (__v2di)(__m128i)_mm_setzero_si128(), (__mmask16)(U))) + +#define _mm_mask_slli_epi16(W, U, X, C) \ + ((__m128i)__builtin_ia32_psllwi128_mask( \ + (__v8hi)(__m128i)(X), (int)(C), (__v8hi)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_slli_epi16(U, X, C) \ + ((__m128i)__builtin_ia32_psllwi128_mask( \ + (__v8hi)(__m128i)(X), (int)(C), (__v8hi)(__m128i)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_dbsad_epu8(X, Y, C) \ + ((__m256i)__builtin_ia32_dbpsadbw256_mask( \ + (__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(C), \ + (__v16hi)(__m256i)_mm256_setzero_si256(), (__mmask16)-1)) + +#define _mm256_mask_slli_epi16(W, U, X, C) \ + ((__m256i)__builtin_ia32_psllwi256_mask( \ + (__v16hi)(__m256i)(X), (int)(C), (__v16hi)(__m256i)(W), (__mmask16)(U))) + +#define _mm256_maskz_slli_epi16(U, X, C) \ + ((__m256i)__builtin_ia32_psllwi256_mask( \ + (__v16hi)(__m256i)(X), (int)(C), \ + (__v16hi)(__m256i)_mm256_setzero_si256(), (__mmask16)(U))) + +#define
_mm256_mask_dbsad_epu8(W, U, X, Y, C) \ + ((__m256i)__builtin_ia32_dbpsadbw256_mask( \ + (__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(C), \ + (__v16hi)(__m256i)(W), (__mmask16)(U))) + +#define _mm256_maskz_dbsad_epu8(U, X, Y, C) \ + ((__m256i)__builtin_ia32_dbpsadbw256_mask( \ + (__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(C), \ + (__v16hi)(__m256i)_mm256_setzero_si256(), (__mmask16)(U))) + +#define _mm_dbsad_epu8(X, Y, C) \ + ((__m128i)__builtin_ia32_dbpsadbw128_mask( \ + (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(C), \ + (__v8hi)(__m128i)_mm_setzero_si128(), (__mmask8)-1)) + +#define _mm_mask_dbsad_epu8(W, U, X, Y, C) \ + ((__m128i)__builtin_ia32_dbpsadbw128_mask( \ + (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(C), \ + (__v8hi)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_dbsad_epu8(U, X, Y, C) \ + ((__m128i)__builtin_ia32_dbpsadbw128_mask( \ + (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(C), \ + (__v8hi)(__m128i)_mm_setzero_si128(), (__mmask8)(U))) + +#define _mm_mask_blend_epi16(__U, __A, __W) \ + ((__m128i)__builtin_ia32_blendmw_128_mask((__v8hi)(__A), (__v8hi)(__W), \ + (__mmask8)(__U))) + +#define _mm_mask_blend_epi8(__U, __A, __W) \ + ((__m128i)__builtin_ia32_blendmb_128_mask((__v16qi)(__A), (__v16qi)(__W), \ + (__mmask16)(__U))) + +#define _mm256_mask_blend_epi16(__U, __A, __W) \ + ((__m256i)__builtin_ia32_blendmw_256_mask((__v16hi)(__A), (__v16hi)(__W), \ + (__mmask16)(__U))) + +#define _mm256_mask_blend_epi8(__U, __A, __W) \ + ((__m256i)__builtin_ia32_blendmb_256_mask((__v32qi)(__A), (__v32qi)(__W), \ + (__mmask32)(__U))) + +#define _mm_cmp_epi16_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpw128_mask( \ + (__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), (__mmask8)(-1))) + +#define _mm_cmp_epi8_mask(X, Y, P) \ + ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(P), \ + (__mmask16)(-1))) + +#define _mm256_cmp_epi16_mask(X, Y, P) \ + ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(X), \ + (__v16hi)(__m256i)(Y), (int)(P), \ + (__mmask16)(-1))) + +#define _mm256_cmp_epi8_mask(X, Y, P) \ + ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(X), \ + (__v32qi)(__m256i)(Y), (int)(P), \ + (__mmask32)(-1))) + +#define _mm_cmp_epu16_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_ucmpw128_mask( \ + (__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), (__mmask8)(-1))) + +#define _mm_cmp_epu8_mask(X, Y, P) \ + ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(P), \ + (__mmask16)(-1))) + +#define _mm256_cmp_epu16_mask(X, Y, P) \ + ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(X), \ + (__v16hi)(__m256i)(Y), (int)(P), \ + (__mmask16)(-1))) + +#define _mm256_cmp_epu8_mask(X, Y, P) \ + ((__mmask32)__builtin_ia32_ucmpb256_mask( \ + (__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(P), (__mmask32)-1)) + +#define _mm_mask_cmp_epi16_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpw128_mask( \ + (__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), (__mmask8)(M))) + +#define _mm_mask_cmp_epi8_mask(M, X, Y, P) \ + ((__mmask16)__builtin_ia32_cmpb128_mask( \ + (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(P), (__mmask16)(M))) + +#define _mm256_mask_cmp_epi16_mask(M, X, Y, P) \ + ((__mmask16)__builtin_ia32_cmpw256_mask( \ + (__v16hi)(__m256i)(X), (__v16hi)(__m256i)(Y), (int)(P), (__mmask16)(M))) + +#define _mm256_mask_cmp_epi8_mask(M, X, Y, P) \ + ((__mmask32)__builtin_ia32_cmpb256_mask( \ + (__v32qi)(__m256i)(X), 
(__v32qi)(__m256i)(Y), (int)(P), (__mmask32)(M))) + +#define _mm_mask_cmp_epu16_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_ucmpw128_mask( \ + (__v8hi)(__m128i)(X), (__v8hi)(__m128i)(Y), (int)(P), (__mmask8)(M))) + +#define _mm_mask_cmp_epu8_mask(M, X, Y, P) \ + ((__mmask16)__builtin_ia32_ucmpb128_mask( \ + (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (int)(P), (__mmask16)(M))) + +#define _mm256_mask_cmp_epu16_mask(M, X, Y, P) \ + ((__mmask16)__builtin_ia32_ucmpw256_mask( \ + (__v16hi)(__m256i)(X), (__v16hi)(__m256i)(Y), (int)(P), (__mmask16)(M))) + +#define _mm256_mask_cmp_epu8_mask(M, X, Y, P) \ + ((__mmask32)__builtin_ia32_ucmpb256_mask( \ + (__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(P), (__mmask32)(M))) +#endif + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpneq_epi8_mask(__m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 4, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmplt_epi8_mask(__m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 1, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpge_epi8_mask(__m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 5, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmple_epi8_mask(__m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 2, + (__mmask32)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpneq_epi16_mask(__m256i __X, __m256i __Y) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 4, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmplt_epi16_mask(__m256i __X, __m256i __Y) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 1, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpge_epi16_mask(__m256i __X, __m256i __Y) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 5, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmple_epi16_mask(__m256i __X, __m256i __Y) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 2, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpneq_epu8_mask(__m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 4, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_epu8_mask(__m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 1, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpge_epu8_mask(__m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 5, + (__mmask16)-1); +} + +extern __inline __mmask16 +
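/*
 * Editor's aside (illustrative sketch, not part of the patch): the named
 * wrappers here are the generic cmp intrinsics with a fixed predicate; the
 * immediate follows the _MM_CMPINT_* encoding (0=EQ, 1=LT, 2=LE, 4=NE,
 * 5=NLT/GE, 6=NLE/GT). Assuming -mavx512vl -mavx512bw and <immintrin.h>:
 *
 *   #include <assert.h>
 *   #include <immintrin.h>
 *   int main(void) {
 *     __m128i x = _mm_set1_epi8(1), y = _mm_set1_epi8(2);
 *     assert(_mm_cmplt_epu8_mask(x, y) == _mm_cmp_epu8_mask(x, y, 1));
 *     assert(_mm_cmplt_epu8_mask(x, y) == 0xFFFF);  // all 16 lanes compare
 *     return 0;
 *   }
 */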
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmple_epu8_mask(__m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 2, + (__mmask16)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpneq_epu16_mask(__m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 4, + (__mmask8)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_epu16_mask(__m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 1, + (__mmask8)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpge_epu16_mask(__m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 5, + (__mmask8)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmple_epu16_mask(__m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 2, + (__mmask8)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpneq_epi8_mask(__m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 4, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_epi8_mask(__m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 1, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpge_epi8_mask(__m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 5, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmple_epi8_mask(__m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 2, + (__mmask16)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpneq_epi16_mask(__m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 4, + (__mmask8)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_epi16_mask(__m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 1, + (__mmask8)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpge_epi16_mask(__m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 5, + (__mmask8)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmple_epi16_mask(__m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 2, + (__mmask8)-1); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_pmulhrsw256_mask((__v16hi)__X, (__v16hi)__Y, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_pmulhrsw256_mask( + (__v16hi)__X, (__v16hi)__Y, (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmulhuw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmulhuw256_mask( + (__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmulhw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmulhw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmulhw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmulhw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmulhuw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmulhuw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pmulhrsw128_mask((__v8hi)__X, (__v8hi)__Y, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pmulhrsw128_mask( + (__v8hi)__X, (__v8hi)__Y, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmullw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_mullo_epi16(__mmask16 __U, __m256i 
__A, __m256i __B) { + return (__m256i)__builtin_ia32_pmullw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmullw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmullw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovsxbw256_mask((__v16qi)__A, (__v16hi)__W, + (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovsxbw256_mask( + (__v16qi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsxbw128_mask((__v16qi)__A, (__v8hi)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsxbw128_mask( + (__v16qi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovzxbw256_mask((__v16qi)__A, (__v16hi)__W, + (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovzxbw256_mask( + (__v16qi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovzxbw128_mask((__v16qi)__A, (__v8hi)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovzxbw128_mask( + (__v16qi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pavgb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)__W, (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pavgb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)_mm256_setzero_si256(), + (__mmask32)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pavgb128_mask((__v16qi)__A, (__v16qi)__B, + (__v16qi)__W, (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pavgb128_mask( + (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pavgw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pavgw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pavgw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pavgw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_paddb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)__W, (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_paddb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)_mm256_setzero_si256(), + (__mmask32)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_paddw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_paddw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_paddsb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)__W, (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_paddsb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)_mm256_setzero_si256(), + (__mmask32)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return 
(__m256i)__builtin_ia32_paddsw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_paddsw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_paddusb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)__W, (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_paddusb256_mask( + (__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), + (__mmask32)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_paddusw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_paddusw256_mask( + (__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psubb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)__W, (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psubb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)_mm256_setzero_si256(), + (__mmask32)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psubw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psubw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psubsb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)__W, (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psubsb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)_mm256_setzero_si256(), + (__mmask32)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return 
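/*
 * Editor's aside (illustrative sketch, not part of the patch): adds_epu8
 * saturates instead of wrapping, and the maskz forms zero the inactive
 * lanes. Assuming -mavx512vl -mavx512bw and <immintrin.h>:
 *
 *   #include <assert.h>
 *   #include <immintrin.h>
 *   int main(void) {
 *     __m256i a = _mm256_set1_epi8((char)200), b = _mm256_set1_epi8(100);
 *     unsigned char out[32];
 *     _mm256_storeu_si256((__m256i *)out,
 *                         _mm256_maskz_adds_epu8(0xFFFFu, a, b));
 *     assert(out[0] == 255 && out[31] == 0);  // saturated vs. zero-masked
 *     return 0;
 *   }
 */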
(__m256i)__builtin_ia32_psubsw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psubsw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psubusb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)__W, (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psubusb256_mask( + (__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), + (__mmask32)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psubusw256_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psubusw256_mask( + (__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_add_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddb128_mask((__v16qi)__A, (__v16qi)__B, + (__v16qi)__W, (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddb128_mask( + (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) { + return (__m256i)__builtin_ia32_punpckhbw256_mask( + (__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_punpckhbw256_mask( + (__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), + (__mmask32)__U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_punpckhbw128_mask( + 
(__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_punpckhbw128_mask( + (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { + return (__m256i)__builtin_ia32_punpckhwd256_mask( + (__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_punpckhwd256_mask( + (__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_punpckhwd128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_punpckhwd128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) { + return (__m256i)__builtin_ia32_punpcklbw256_mask( + (__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_punpcklbw256_mask( + (__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), + (__mmask32)__U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_punpcklbw128_mask( + (__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_punpcklbw128_mask( + (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { + return (__m256i)__builtin_ia32_punpcklwd256_mask( + (__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_punpcklwd256_mask( + (__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return 
(__m128i)__builtin_ia32_punpcklwd128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_punpcklwd128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_epi8_mask(__m128i __A, __m128i __B) { + return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__A, (__v16qi)__B, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_epu8_mask(__m128i __A, __m128i __B) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__A, (__v16qi)__B, 0, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmpeq_epu8_mask(__mmask16 __U, __m128i __A, __m128i __B) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__A, (__v16qi)__B, 0, + __U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmpeq_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { + return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__A, (__v16qi)__B, + __U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpeq_epu8_mask(__m256i __A, __m256i __B) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__A, (__v32qi)__B, 0, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpeq_epi8_mask(__m256i __A, __m256i __B) { + return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__A, (__v32qi)__B, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmpeq_epu8_mask(__mmask32 __U, __m256i __A, __m256i __B) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__A, (__v32qi)__B, 0, + __U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmpeq_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) { + return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__A, (__v32qi)__B, + __U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_epu16_mask(__m128i __A, __m128i __B) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__A, (__v8hi)__B, 0, + (__mmask8)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_epi16_mask(__m128i __A, __m128i __B) { + return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__A, (__v8hi)__B, + (__mmask8)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmpeq_epu16_mask(__mmask8 __U, __m128i __A, __m128i __B) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__A, (__v8hi)__B, 0, + __U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmpeq_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) { + return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__A, (__v8hi)__B, + __U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm256_cmpeq_epu16_mask(__m256i __A, __m256i __B) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__A, (__v16hi)__B, 0, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpeq_epi16_mask(__m256i __A, __m256i __B) { + return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__A, (__v16hi)__B, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmpeq_epu16_mask(__mmask16 __U, __m256i __A, __m256i __B) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__A, (__v16hi)__B, 0, + __U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmpeq_epi16_mask(__mmask16 __U, __m256i __A, __m256i __B) { + return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__A, (__v16hi)__B, + __U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_epu8_mask(__m128i __A, __m128i __B) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__A, (__v16qi)__B, 6, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_epi8_mask(__m128i __A, __m128i __B) { + return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__A, (__v16qi)__B, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmpgt_epu8_mask(__mmask16 __U, __m128i __A, __m128i __B) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__A, (__v16qi)__B, 6, + __U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmpgt_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { + return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__A, (__v16qi)__B, + __U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpgt_epu8_mask(__m256i __A, __m256i __B) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__A, (__v32qi)__B, 6, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpgt_epi8_mask(__m256i __A, __m256i __B) { + return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__A, (__v32qi)__B, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmpgt_epu8_mask(__mmask32 __U, __m256i __A, __m256i __B) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__A, (__v32qi)__B, 6, + __U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmpgt_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) { + return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__A, (__v32qi)__B, + __U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_epu16_mask(__m128i __A, __m128i __B) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__A, (__v8hi)__B, 6, + (__mmask8)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_epi16_mask(__m128i __A, __m128i __B) { + return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__A, (__v8hi)__B, + (__mmask8)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm_mask_cmpgt_epu16_mask(__mmask8 __U, __m128i __A, __m128i __B) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__A, (__v8hi)__B, 6, + __U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmpgt_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) { + return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__A, (__v8hi)__B, + __U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpgt_epu16_mask(__m256i __A, __m256i __B) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__A, (__v16hi)__B, 6, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpgt_epi16_mask(__m256i __A, __m256i __B) { + return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__A, (__v16hi)__B, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmpgt_epu16_mask(__mmask16 __U, __m256i __A, __m256i __B) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__A, (__v16hi)__B, 6, + __U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmpgt_epi16_mask(__mmask16 __U, __m256i __A, __m256i __B) { + return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__A, (__v16hi)__B, + __U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_testn_epi8_mask(__m128i __A, __m128i __B) { + return (__mmask16)__builtin_ia32_ptestnmb128((__v16qi)__A, (__v16qi)__B, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_testn_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { + return (__mmask16)__builtin_ia32_ptestnmb128((__v16qi)__A, (__v16qi)__B, __U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_testn_epi8_mask(__m256i __A, __m256i __B) { + return (__mmask32)__builtin_ia32_ptestnmb256((__v32qi)__A, (__v32qi)__B, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_testn_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) { + return (__mmask32)__builtin_ia32_ptestnmb256((__v32qi)__A, (__v32qi)__B, __U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_testn_epi16_mask(__m128i __A, __m128i __B) { + return (__mmask8)__builtin_ia32_ptestnmw128((__v8hi)__A, (__v8hi)__B, + (__mmask8)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_testn_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) { + return (__mmask8)__builtin_ia32_ptestnmw128((__v8hi)__A, (__v8hi)__B, __U); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_testn_epi16_mask(__m256i __A, __m256i __B) { + return (__mmask16)__builtin_ia32_ptestnmw256((__v16hi)__A, (__v16hi)__B, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_testn_epi16_mask(__mmask16 __U, __m256i __A, __m256i __B) { + return (__mmask16)__builtin_ia32_ptestnmw256((__v16hi)__A, (__v16hi)__B, __U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) 
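/*
 * Editor's aside (illustrative sketch, not part of the patch): the ptestnm
 * family above sets mask bit j when (__A[j] & __B[j]) == 0, i.e. a per-lane
 * "AND is zero" test. Assuming -mavx512vl -mavx512bw and <immintrin.h>:
 *
 *   #include <assert.h>
 *   #include <immintrin.h>
 *   int main(void) {
 *     __m128i a = _mm_set1_epi8(1), b = _mm_set1_epi8(2);
 *     assert(_mm_testn_epi8_mask(a, b) == 0xFFFF);  // 1 & 2 == 0 everywhere
 *     assert(_mm_testn_epi8_mask(a, a) == 0);       // 1 & 1 != 0 everywhere
 *     return 0;
 *   }
 */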
+_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pshufb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)__W, (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pshufb256_mask((__v32qi)__A, (__v32qi)__B, + (__v32qi)_mm256_setzero_si256(), + (__mmask32)__U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pshufb128_mask((__v16qi)__A, (__v16qi)__B, + (__v16qi)__W, (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pshufb128_mask( + (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_packsswb256_mask( + (__v16hi)__A, (__v16hi)__B, (__v32qi)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_packsswb256_mask((__v16hi)__A, (__v16hi)__B, + (__v32qi)__W, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_packsswb128_mask( + (__v8hi)__A, (__v8hi)__B, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_packsswb128_mask((__v8hi)__A, (__v8hi)__B, + (__v16qi)__W, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_packuswb256_mask( + (__v16hi)__A, (__v16hi)__B, (__v32qi)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_packuswb256_mask((__v16hi)__A, (__v16hi)__B, + (__v32qi)__W, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_packuswb128_mask( + (__v8hi)__A, (__v8hi)__B, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_packuswb128_mask((__v8hi)__A, (__v8hi)__B, + (__v16qi)__W, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A) { + return (__m256i)__builtin_ia32_pabsb256_mask((__v32qi)__A, 
(__v32qi)__W, + (__mmask32)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_abs_epi8(__mmask32 __U, __m256i __A) { + return (__m256i)__builtin_ia32_pabsb256_mask( + (__v32qi)__A, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pabsb128_mask((__v16qi)__A, (__v16qi)__W, + (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_abs_epi8(__mmask16 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pabsb128_mask( + (__v16qi)__A, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A) { + return (__m256i)__builtin_ia32_pabsw256_mask((__v16hi)__A, (__v16hi)__W, + (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) { + return (__m256i)__builtin_ia32_pabsw256_mask( + (__v16hi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pabsw128_mask((__v8hi)__A, (__v8hi)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_abs_epi16(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pabsw128_mask( + (__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpneq_epu8_mask(__m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 4, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmplt_epu8_mask(__m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 1, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpge_epu8_mask(__m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 5, + (__mmask32)-1); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmple_epu8_mask(__m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 2, + (__mmask32)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpneq_epu16_mask(__m256i __X, __m256i __Y) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 4, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmplt_epu16_mask(__m256i __X, __m256i __Y) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 1, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmpge_epu16_mask(__m256i __X, __m256i __Y) { + return 
(__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 5, + (__mmask16)-1); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmple_epu16_mask(__m256i __X, __m256i __Y) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 2, + (__mmask16)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_storeu_epi16(void *__P, __mmask16 __U, __m256i __A) { + __builtin_ia32_storedquhi256_mask((short *)__P, (__v16hi)__A, (__mmask16)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_storeu_epi16(void *__P, __mmask8 __U, __m128i __A) { + __builtin_ia32_storedquhi128_mask((short *)__P, (__v8hi)__A, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddsw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubsb128_mask((__v16qi)__A, (__v16qi)__B, + (__v16qi)__W, (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubsb128_mask( + (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubsw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubsw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubusb128_mask((__v16qi)__A, (__v16qi)__B, + (__v16qi)__W, (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubusb128_mask( + (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubusw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubusw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i 
__A, __m128i __B) { + return (__m256i)__builtin_ia32_psrlw256_mask((__v16hi)__A, (__v8hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B) { + return (__m256i)__builtin_ia32_psrlw256_mask((__v16hi)__A, (__v8hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psrlw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_srl_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psrlw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { + return (__m256i)__builtin_ia32_psraw256_mask((__v16hi)__A, (__v8hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B) { + return (__m256i)__builtin_ia32_psraw256_mask((__v16hi)__A, (__v8hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psraw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psraw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddsw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddusb128_mask((__v16qi)__A, (__v16qi)__B, + (__v16qi)__W, (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddusb128_mask( + (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddusw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddusw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), 
(__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubb128_mask((__v16qi)__A, (__v16qi)__B, + (__v16qi)__W, (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubb128_mask( + (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddsb128_mask((__v16qi)__A, (__v16qi)__B, + (__v16qi)__W, (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddsb128_mask( + (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi16_epi8(__m128i __A) { + + return (__m128i)__builtin_ia32_pmovwb128_mask( + (__v8hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi16_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovwb128mem_mask((__v8qi *)__P, (__v8hi)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi16_epi8(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovwb128_mask((__v8hi)__A, (__v16qi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi16_epi8(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovwb128_mask( + (__v8hi)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_srav_epi16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psrav16hi_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)-1); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psrav16hi_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psrav16hi_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + 
(__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srav_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psrav8hi_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psrav8hi_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psrav8hi_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_srlv_epi16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psrlv16hi_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)-1); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psrlv16hi_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psrlv16hi_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srlv_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psrlv8hi_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psrlv8hi_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psrlv8hi_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sllv_epi16(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psllv16hi_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)-1); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psllv16hi_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_psllv16hi_mask((__v16hi)__A, (__v16hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sllv_epi16(__m128i __A, __m128i __B) { + return 
(__m128i)__builtin_ia32_psllv8hi_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psllv8hi_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psllv8hi_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psllw128_mask((__v8hi)__A, (__v8hi)__B, + (__v8hi)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_sll_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psllw128_mask( + (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { + return (__m256i)__builtin_ia32_psllw256_mask((__v16hi)__A, (__v8hi)__B, + (__v16hi)__W, (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) { + return (__m256i)__builtin_ia32_psllw256_mask((__v16hi)__A, (__v8hi)__B, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_packusdw256_mask( + (__v8si)__A, (__v8si)__B, (__v16hi)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_packusdw256_mask((__v8si)__A, (__v8si)__B, + (__v16hi)__W, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_packusdw128_mask( + (__v4si)__A, (__v4si)__B, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_packusdw128_mask((__v4si)__A, (__v4si)__B, + (__v8hi)__W, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_packssdw256_mask( + (__v8si)__A, (__v8si)__B, (__v16hi)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_packssdw256_mask((__v8si)__A, (__v8si)__B, + (__v16hi)__W, __M); +} + +extern __inline __m128i + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_packssdw128_mask( + (__v4si)__A, (__v4si)__B, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_packssdw128_mask((__v4si)__A, (__v4si)__B, + (__v8hi)__W, __M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmpneq_epu8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 4, + (__mmask16)__M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmplt_epu8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 1, + (__mmask16)__M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmpge_epu8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 5, + (__mmask16)__M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmple_epu8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 2, + (__mmask16)__M); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmpneq_epu16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 4, + (__mmask8)__M); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmplt_epu16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 1, + (__mmask8)__M); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmpge_epu16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 5, + (__mmask8)__M); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmple_epu16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 2, + (__mmask8)__M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmpneq_epi8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 4, + (__mmask16)__M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmplt_epi8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 1, + (__mmask16)__M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmpge_epi8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 5, + (__mmask16)__M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm_mask_cmple_epi8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { + return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 2, + (__mmask16)__M); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmpneq_epi16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 4, + (__mmask8)__M); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmplt_epi16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 1, + (__mmask8)__M); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmpge_epi16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 5, + (__mmask8)__M); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cmple_epi16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { + return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 2, + (__mmask8)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmpneq_epu8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 4, + (__mmask32)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmplt_epu8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 1, + (__mmask32)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmpge_epu8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 5, + (__mmask32)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmple_epu8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 2, + (__mmask32)__M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmpneq_epu16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 4, + (__mmask16)__M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmplt_epu16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 1, + (__mmask16)__M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmpge_epu16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 5, + (__mmask16)__M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmple_epu16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { + return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 2, + (__mmask16)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm256_mask_cmpneq_epi8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 4, + (__mmask32)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmplt_epi8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 1, + (__mmask32)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmpge_epi8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 5, + (__mmask32)__M); +} + +extern __inline __mmask32 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmple_epi8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { + return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 2, + (__mmask32)__M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmpneq_epi16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 4, + (__mmask16)__M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmplt_epi16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 1, + (__mmask16)__M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmpge_epi16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 5, + (__mmask16)__M); +} + +extern __inline __mmask16 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cmple_epi16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { + return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 2, + (__mmask16)__M); +} + +#ifdef __DISABLE_AVX512VLBW__ +#undef __DISABLE_AVX512VLBW__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VLBW__ */ + +#endif /* _AVX512VLBWINTRIN_H_INCLUDED */ diff --git a/third_party/intel/avx512vldqintrin.internal.h b/third_party/intel/avx512vldqintrin.internal.h new file mode 100644 index 000000000..89fe84c15 --- /dev/null +++ b/third_party/intel/avx512vldqintrin.internal.h @@ -0,0 +1,1466 @@ +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
+#endif + +#ifndef _AVX512VLDQINTRIN_H_INCLUDED +#define _AVX512VLDQINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512DQ__) +#pragma GCC push_options +#pragma GCC target("avx512vl,avx512dq") +#define __DISABLE_AVX512VLDQ__ +#endif /* __AVX512VLDQ__ */ + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvttpd_epi64(__m256d __A) { + return (__m256i)__builtin_ia32_cvttpd2qq256_mask( + (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvttpd_epi64(__m256i __W, __mmask8 __U, __m256d __A) { + return (__m256i)__builtin_ia32_cvttpd2qq256_mask((__v4df)__A, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvttpd_epi64(__mmask8 __U, __m256d __A) { + return (__m256i)__builtin_ia32_cvttpd2qq256_mask( + (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttpd_epi64(__m128d __A) { + return (__m128i)__builtin_ia32_cvttpd2qq128_mask( + (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvttpd_epi64(__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvttpd2qq128_mask((__v2df)__A, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvttpd_epi64(__mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvttpd2qq128_mask( + (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvttpd_epu64(__m256d __A) { + return (__m256i)__builtin_ia32_cvttpd2uqq256_mask( + (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvttpd_epu64(__m256i __W, __mmask8 __U, __m256d __A) { + return (__m256i)__builtin_ia32_cvttpd2uqq256_mask((__v4df)__A, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvttpd_epu64(__mmask8 __U, __m256d __A) { + return (__m256i)__builtin_ia32_cvttpd2uqq256_mask( + (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttpd_epu64(__m128d __A) { + return (__m128i)__builtin_ia32_cvttpd2uqq128_mask( + (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvttpd_epu64(__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvttpd2uqq128_mask((__v2df)__A, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvttpd_epu64(__mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvttpd2uqq128_mask( + (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtpd_epi64(__m256d __A) { + return 
(__m256i)__builtin_ia32_cvtpd2qq256_mask( + (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtpd_epi64(__m256i __W, __mmask8 __U, __m256d __A) { + return (__m256i)__builtin_ia32_cvtpd2qq256_mask((__v4df)__A, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtpd_epi64(__mmask8 __U, __m256d __A) { + return (__m256i)__builtin_ia32_cvtpd2qq256_mask( + (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpd_epi64(__m128d __A) { + return (__m128i)__builtin_ia32_cvtpd2qq128_mask( + (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtpd_epi64(__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvtpd2qq128_mask((__v2df)__A, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtpd_epi64(__mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvtpd2qq128_mask( + (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtpd_epu64(__m256d __A) { + return (__m256i)__builtin_ia32_cvtpd2uqq256_mask( + (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtpd_epu64(__m256i __W, __mmask8 __U, __m256d __A) { + return (__m256i)__builtin_ia32_cvtpd2uqq256_mask((__v4df)__A, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtpd_epu64(__mmask8 __U, __m256d __A) { + return (__m256i)__builtin_ia32_cvtpd2uqq256_mask( + (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpd_epu64(__m128d __A) { + return (__m128i)__builtin_ia32_cvtpd2uqq128_mask( + (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtpd_epu64(__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvtpd2uqq128_mask((__v2df)__A, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtpd_epu64(__mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvtpd2uqq128_mask( + (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvttps_epi64(__m128 __A) { + return (__m256i)__builtin_ia32_cvttps2qq256_mask( + (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvttps_epi64(__m256i __W, __mmask8 __U, __m128 __A) { + return (__m256i)__builtin_ia32_cvttps2qq256_mask((__v4sf)__A, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
+ _mm256_maskz_cvttps_epi64(__mmask8 __U, __m128 __A) { + return (__m256i)__builtin_ia32_cvttps2qq256_mask( + (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttps_epi64(__m128 __A) { + return (__m128i)__builtin_ia32_cvttps2qq128_mask( + (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvttps_epi64(__m128i __W, __mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_cvttps2qq128_mask((__v4sf)__A, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvttps_epi64(__mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_cvttps2qq128_mask( + (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvttps_epu64(__m128 __A) { + return (__m256i)__builtin_ia32_cvttps2uqq256_mask( + (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvttps_epu64(__m256i __W, __mmask8 __U, __m128 __A) { + return (__m256i)__builtin_ia32_cvttps2uqq256_mask((__v4sf)__A, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvttps_epu64(__mmask8 __U, __m128 __A) { + return (__m256i)__builtin_ia32_cvttps2uqq256_mask( + (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttps_epu64(__m128 __A) { + return (__m128i)__builtin_ia32_cvttps2uqq128_mask( + (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvttps_epu64(__m128i __W, __mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_cvttps2uqq128_mask((__v4sf)__A, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvttps_epu64(__mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_cvttps2uqq128_mask( + (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcast_f64x2(__m128d __A) { + return (__m256d)__builtin_ia32_broadcastf64x2_256_mask( + (__v2df)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) { + return (__m256d)__builtin_ia32_broadcastf64x2_256_mask((__v2df)__A, + (__v4df)__O, __M); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) { + return (__m256d)__builtin_ia32_broadcastf64x2_256_mask( + (__v2df)__A, (__v4df)_mm256_setzero_ps(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcast_i64x2(__m128i __A) { + return (__m256i)__builtin_ia32_broadcasti64x2_256_mask( + (__v2di)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); +} + +extern 
__inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) { + return (__m256i)__builtin_ia32_broadcasti64x2_256_mask((__v2di)__A, + (__v4di)__O, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { + return (__m256i)__builtin_ia32_broadcasti64x2_256_mask( + (__v2di)__A, (__v4di)_mm256_setzero_si256(), __M); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcast_f32x2(__m128 __A) { + return (__m256)__builtin_ia32_broadcastf32x2_256_mask( + (__v4sf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A) { + return (__m256)__builtin_ia32_broadcastf32x2_256_mask((__v4sf)__A, + (__v8sf)__O, __M); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) { + return (__m256)__builtin_ia32_broadcastf32x2_256_mask( + (__v4sf)__A, (__v8sf)_mm256_setzero_ps(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcast_i32x2(__m128i __A) { + return (__m256i)__builtin_ia32_broadcasti32x2_256_mask( + (__v4si)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, __m128i __A) { + return (__m256i)__builtin_ia32_broadcasti32x2_256_mask((__v4si)__A, + (__v8si)__O, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { + return (__m256i)__builtin_ia32_broadcasti32x2_256_mask( + (__v4si)__A, (__v8si)_mm256_setzero_si256(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_broadcast_i32x2(__m128i __A) { + return (__m128i)__builtin_ia32_broadcasti32x2_128_mask( + (__v4si)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_broadcasti32x2_128_mask((__v4si)__A, + (__v4si)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_broadcasti32x2_128_mask( + (__v4si)__A, (__v4si)_mm_setzero_si128(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mullo_epi64(__m256i __A, __m256i __B) { + return (__m256i)((__v4du)__A * (__v4du)__B); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmullq256_mask((__v4di)__A, (__v4di)__B, + (__v4di)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) { + return 
(__m256i)__builtin_ia32_pmullq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mullo_epi64(__m128i __A, __m128i __B) { + return (__m128i)((__v2du)__A * (__v2du)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmullq128_mask((__v2di)__A, (__v2di)__B, + (__v2di)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmullq128_mask( + (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_andnpd256_mask((__v4df)__A, (__v4df)__B, + (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_andnpd256_mask( + (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_andnpd128_mask((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_andnpd128_mask( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_andnps256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__W, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_andnps256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_andnps128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_andnps128_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtps_epi64(__m128 __A) { + return (__m256i)__builtin_ia32_cvtps2qq256_mask( + (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtps_epi64(__m256i __W, __mmask8 __U, __m128 __A) { + return 
(__m256i)__builtin_ia32_cvtps2qq256_mask((__v4sf)__A, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtps_epi64(__mmask8 __U, __m128 __A) { + return (__m256i)__builtin_ia32_cvtps2qq256_mask( + (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtps_epi64(__m128 __A) { + return (__m128i)__builtin_ia32_cvtps2qq128_mask( + (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtps_epi64(__m128i __W, __mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_cvtps2qq128_mask((__v4sf)__A, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtps_epi64(__mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_cvtps2qq128_mask( + (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtps_epu64(__m128 __A) { + return (__m256i)__builtin_ia32_cvtps2uqq256_mask( + (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtps_epu64(__m256i __W, __mmask8 __U, __m128 __A) { + return (__m256i)__builtin_ia32_cvtps2uqq256_mask((__v4sf)__A, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtps_epu64(__mmask8 __U, __m128 __A) { + return (__m256i)__builtin_ia32_cvtps2uqq256_mask( + (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtps_epu64(__m128 __A) { + return (__m128i)__builtin_ia32_cvtps2uqq128_mask( + (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtps_epu64(__m128i __W, __mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_cvtps2uqq128_mask((__v4sf)__A, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtps_epu64(__mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_cvtps2uqq128_mask( + (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi64_ps(__m256i __A) { + return (__m128)__builtin_ia32_cvtqq2ps256_mask( + (__v4di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi64_ps(__m128 __W, __mmask8 __U, __m256i __A) { + return (__m128)__builtin_ia32_cvtqq2ps256_mask((__v4di)__A, (__v4sf)__W, + (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi64_ps(__mmask8 __U, __m256i __A) { + return (__m128)__builtin_ia32_cvtqq2ps256_mask( + (__v4di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi64_ps(__m128i __A) { + return 
(__m128)__builtin_ia32_cvtqq2ps128_mask( + (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi64_ps(__m128 __W, __mmask8 __U, __m128i __A) { + return (__m128)__builtin_ia32_cvtqq2ps128_mask((__v2di)__A, (__v4sf)__W, + (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi64_ps(__mmask8 __U, __m128i __A) { + return (__m128)__builtin_ia32_cvtqq2ps128_mask( + (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepu64_ps(__m256i __A) { + return (__m128)__builtin_ia32_cvtuqq2ps256_mask( + (__v4di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepu64_ps(__m128 __W, __mmask8 __U, __m256i __A) { + return (__m128)__builtin_ia32_cvtuqq2ps256_mask((__v4di)__A, (__v4sf)__W, + (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepu64_ps(__mmask8 __U, __m256i __A) { + return (__m128)__builtin_ia32_cvtuqq2ps256_mask( + (__v4di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu64_ps(__m128i __A) { + return (__m128)__builtin_ia32_cvtuqq2ps128_mask( + (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepu64_ps(__m128 __W, __mmask8 __U, __m128i __A) { + return (__m128)__builtin_ia32_cvtuqq2ps128_mask((__v2di)__A, (__v4sf)__W, + (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepu64_ps(__mmask8 __U, __m128i __A) { + return (__m128)__builtin_ia32_cvtuqq2ps128_mask( + (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi64_pd(__m256i __A) { + return (__m256d)__builtin_ia32_cvtqq2pd256_mask( + (__v4di)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi64_pd(__m256d __W, __mmask8 __U, __m256i __A) { + return (__m256d)__builtin_ia32_cvtqq2pd256_mask((__v4di)__A, (__v4df)__W, + (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi64_pd(__mmask8 __U, __m256i __A) { + return (__m256d)__builtin_ia32_cvtqq2pd256_mask( + (__v4di)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi64_pd(__m128i __A) { + return (__m128d)__builtin_ia32_cvtqq2pd128_mask( + (__v2di)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi64_pd(__m128d __W, __mmask8 __U, __m128i __A) { + return (__m128d)__builtin_ia32_cvtqq2pd128_mask((__v2di)__A, (__v2df)__W, + (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi64_pd(__mmask8 __U, __m128i __A) { + return 
(__m128d)__builtin_ia32_cvtqq2pd128_mask( + (__v2di)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepu64_pd(__m256i __A) { + return (__m256d)__builtin_ia32_cvtuqq2pd256_mask( + (__v4di)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepu64_pd(__m256d __W, __mmask8 __U, __m256i __A) { + return (__m256d)__builtin_ia32_cvtuqq2pd256_mask((__v4di)__A, (__v4df)__W, + (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepu64_pd(__mmask8 __U, __m256i __A) { + return (__m256d)__builtin_ia32_cvtuqq2pd256_mask( + (__v4di)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_andpd256_mask((__v4df)__A, (__v4df)__B, + (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_andpd256_mask( + (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_andpd128_mask((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_andpd128_mask( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_andps256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__W, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_andps256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_andps128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_andps128_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu64_pd(__m128i __A) { + return (__m128d)__builtin_ia32_cvtuqq2pd128_mask( + (__v2di)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepu64_pd(__m128d __W, __mmask8 __U, __m128i __A) { + return 
(__m128d)__builtin_ia32_cvtuqq2pd128_mask((__v2di)__A, (__v2df)__W, + (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepu64_pd(__mmask8 __U, __m128i __A) { + return (__m128d)__builtin_ia32_cvtuqq2pd128_mask( + (__v2di)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_xorpd256_mask((__v4df)__A, (__v4df)__B, + (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_xorpd256_mask( + (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_xorpd128_mask((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_xor_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_xorpd128_mask( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_xorps256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__W, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_xorps256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_xorps128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_xorps128_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_orpd256_mask((__v4df)__A, (__v4df)__B, + (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_orpd256_mask( + (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_orpd128_mask((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) { 
+ return (__m128d)__builtin_ia32_orpd128_mask( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_orps256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__W, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_orps256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_orps128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_orps128_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movm_epi32(__mmask8 __A) { + return (__m128i)__builtin_ia32_cvtmask2d128(__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_movm_epi32(__mmask8 __A) { + return (__m256i)__builtin_ia32_cvtmask2d256(__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movm_epi64(__mmask8 __A) { + return (__m128i)__builtin_ia32_cvtmask2q128(__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_movm_epi64(__mmask8 __A) { + return (__m256i)__builtin_ia32_cvtmask2q256(__A); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movepi32_mask(__m128i __A) { + return (__mmask8)__builtin_ia32_cvtd2mask128((__v4si)__A); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_movepi32_mask(__m256i __A) { + return (__mmask8)__builtin_ia32_cvtd2mask256((__v8si)__A); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movepi64_mask(__m128i __A) { + return (__mmask8)__builtin_ia32_cvtq2mask128((__v2di)__A); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_movepi64_mask(__m256i __A) { + return (__mmask8)__builtin_ia32_cvtq2mask256((__v4di)__A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_extractf64x2_pd(__m256d __A, const int __imm) { + return (__m128d)__builtin_ia32_extractf64x2_256_mask( + (__v4df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A, + const int __imm) { + return (__m128d)__builtin_ia32_extractf64x2_256_mask( + (__v4df)__A, __imm, (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A, const int __imm) { + return 
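/*
 * Usage sketch (illustrative): VEXTRACTF64X2 pulls one of the two
 * 128-bit lanes out of a __m256d, selected by the immediate; the
 * mask_/maskz_ forms merge or zero the result as usual. E.g. the high
 * pair of doubles:
 *
 *   __m128d hi = _mm256_extractf64x2_pd(v, 1);   // lanes 2..3
 */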
(__m128d)__builtin_ia32_extractf64x2_256_mask( + (__v4df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_extracti64x2_epi64(__m256i __A, const int __imm) { + return (__m128i)__builtin_ia32_extracti64x2_256_mask( + (__v4di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __A, + const int __imm) { + return (__m128i)__builtin_ia32_extracti64x2_256_mask( + (__v4di)__A, __imm, (__v2di)__W, (__mmask8)__U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A, const int __imm) { + return (__m128i)__builtin_ia32_extracti64x2_256_mask( + (__v4di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_reduce_pd(__m256d __A, int __B) { + return (__m256d)__builtin_ia32_reducepd256_mask( + (__v4df)__A, __B, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_reduce_pd(__m256d __W, __mmask8 __U, __m256d __A, int __B) { + return (__m256d)__builtin_ia32_reducepd256_mask((__v4df)__A, __B, (__v4df)__W, + (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_reduce_pd(__mmask8 __U, __m256d __A, int __B) { + return (__m256d)__builtin_ia32_reducepd256_mask( + (__v4df)__A, __B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_reduce_pd(__m128d __A, int __B) { + return (__m128d)__builtin_ia32_reducepd128_mask( + (__v2df)__A, __B, (__v2df)_mm_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_reduce_pd(__m128d __W, __mmask8 __U, __m128d __A, int __B) { + return (__m128d)__builtin_ia32_reducepd128_mask((__v2df)__A, __B, (__v2df)__W, + (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_reduce_pd(__mmask8 __U, __m128d __A, int __B) { + return (__m128d)__builtin_ia32_reducepd128_mask( + (__v2df)__A, __B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_reduce_ps(__m256 __A, int __B) { + return (__m256)__builtin_ia32_reduceps256_mask( + (__v8sf)__A, __B, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_reduce_ps(__m256 __W, __mmask8 __U, __m256 __A, int __B) { + return (__m256)__builtin_ia32_reduceps256_mask((__v8sf)__A, __B, (__v8sf)__W, + (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_reduce_ps(__mmask8 __U, __m256 __A, int __B) { + return (__m256)__builtin_ia32_reduceps256_mask( + (__v8sf)__A, __B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_reduce_ps(__m128 __A, int __B) { + return 
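/*
 * Note (illustrative): VREDUCE* returns the "reduced argument"
 * x - round(x * 2^M) / 2^M, where M is immediate bits [7:4] and bits
 * [1:0] pick the rounding mode (0 nearest-even, 1 down, 2 up,
 * 3 truncate). With the immediate 0x01 (M = 0, round down) each lane
 * becomes its positive fractional part:
 *
 *   __m256d frac = _mm256_reduce_pd(x, 0x01);   // x - floor(x)
 */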
(__m128)__builtin_ia32_reduceps128_mask( + (__v4sf)__A, __B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_reduce_ps(__m128 __W, __mmask8 __U, __m128 __A, int __B) { + return (__m128)__builtin_ia32_reduceps128_mask((__v4sf)__A, __B, (__v4sf)__W, + (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_reduce_ps(__mmask8 __U, __m128 __A, int __B) { + return (__m128)__builtin_ia32_reduceps128_mask( + (__v4sf)__A, __B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_range_pd(__m256d __A, __m256d __B, int __C) { + return (__m256d)__builtin_ia32_rangepd256_mask( + (__v4df)__A, (__v4df)__B, __C, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_range_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B, + int __C) { + return (__m256d)__builtin_ia32_rangepd256_mask((__v4df)__A, (__v4df)__B, __C, + (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_range_pd(__mmask8 __U, __m256d __A, __m256d __B, int __C) { + return (__m256d)__builtin_ia32_rangepd256_mask((__v4df)__A, (__v4df)__B, __C, + (__v4df)_mm256_setzero_pd(), + (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_range_pd(__m128d __A, __m128d __B, int __C) { + return (__m128d)__builtin_ia32_rangepd128_mask( + (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_range_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + int __C) { + return (__m128d)__builtin_ia32_rangepd128_mask((__v2df)__A, (__v2df)__B, __C, + (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_range_pd(__mmask8 __U, __m128d __A, __m128d __B, int __C) { + return (__m128d)__builtin_ia32_rangepd128_mask( + (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_range_ps(__m256 __A, __m256 __B, int __C) { + return (__m256)__builtin_ia32_rangeps256_mask( + (__v8sf)__A, (__v8sf)__B, __C, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_range_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B, + int __C) { + return (__m256)__builtin_ia32_rangeps256_mask((__v8sf)__A, (__v8sf)__B, __C, + (__v8sf)__W, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_range_ps(__mmask8 __U, __m256 __A, __m256 __B, int __C) { + return (__m256)__builtin_ia32_rangeps256_mask((__v8sf)__A, (__v8sf)__B, __C, + (__v8sf)_mm256_setzero_ps(), + (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_range_ps(__m128 __A, __m128 __B, int __C) { + return (__m128)__builtin_ia32_rangeps128_mask( + (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); +} + +extern __inline 
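/*
 * Note (illustrative): for the VRANGE* family here, immediate bits
 * [1:0] select the operation (0 min, 1 max, 2 min-magnitude,
 * 3 max-magnitude) and bits [3:2] the sign of the result (0 from the
 * first source, 1 from the selected operand, 2 cleared, 3 set). So the
 * larger magnitude of two vectors, returned non-negative:
 *
 *   __m256d m = _mm256_range_pd(a, b, 0x0B);   // max-abs, sign cleared
 */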
__m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_range_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, int __C) { + return (__m128)__builtin_ia32_rangeps128_mask((__v4sf)__A, (__v4sf)__B, __C, + (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_range_ps(__mmask8 __U, __m128 __A, __m128 __B, int __C) { + return (__m128)__builtin_ia32_rangeps128_mask( + (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_fpclass_pd_mask(__mmask8 __U, __m256d __A, const int __imm) { + return (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)__A, __imm, __U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fpclass_pd_mask(__m256d __A, const int __imm) { + return (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)__A, __imm, + (__mmask8)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_fpclass_ps_mask(__mmask8 __U, __m256 __A, const int __imm) { + return (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)__A, __imm, __U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fpclass_ps_mask(__m256 __A, const int __imm) { + return (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)__A, __imm, + (__mmask8)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fpclass_pd_mask(__mmask8 __U, __m128d __A, const int __imm) { + return (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)__A, __imm, __U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fpclass_pd_mask(__m128d __A, const int __imm) { + return (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)__A, __imm, + (__mmask8)-1); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fpclass_ps_mask(__mmask8 __U, __m128 __A, const int __imm) { + return (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)__A, __imm, __U); +} + +extern __inline __mmask8 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fpclass_ps_mask(__m128 __A, const int __imm) { + return (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)__A, __imm, + (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_inserti64x2(__m256i __A, __m128i __B, const int __imm) { + return (__m256i)__builtin_ia32_inserti64x2_256_mask( + (__v4di)__A, (__v2di)__B, __imm, (__v4di)_mm256_setzero_si256(), + (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_inserti64x2(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B, + const int __imm) { + return (__m256i)__builtin_ia32_inserti64x2_256_mask( + (__v4di)__A, (__v2di)__B, __imm, (__v4di)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_inserti64x2(__mmask8 __U, __m256i __A, __m128i __B, + const int __imm) { + return (__m256i)__builtin_ia32_inserti64x2_256_mask( + (__v4di)__A, (__v2di)__B, __imm, (__v4di)_mm256_setzero_si256(), + (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, 
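/*
 * Usage sketch (illustrative): VFPCLASS tests each lane against the
 * categories selected by the immediate (0x01 QNaN, 0x02 +0, 0x04 -0,
 * 0x08 +Inf, 0x10 -Inf, 0x20 denormal, 0x40 negative, 0x80 SNaN) and
 * returns the hits as a k-mask. E.g. flagging non-finite lanes:
 *
 *   __mmask8 bad = _mm256_fpclass_pd_mask(x, 0x99);   // NaN or +/-Inf
 */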
__always_inline__, __artificial__)) + _mm256_insertf64x2(__m256d __A, __m128d __B, const int __imm) { + return (__m256d)__builtin_ia32_insertf64x2_256_mask( + (__v4df)__A, (__v2df)__B, __imm, (__v4df)_mm256_setzero_pd(), + (__mmask8)-1); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_insertf64x2(__m256d __W, __mmask8 __U, __m256d __A, __m128d __B, + const int __imm) { + return (__m256d)__builtin_ia32_insertf64x2_256_mask( + (__v4df)__A, (__v2df)__B, __imm, (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_insertf64x2(__mmask8 __U, __m256d __A, __m128d __B, + const int __imm) { + return (__m256d)__builtin_ia32_insertf64x2_256_mask( + (__v4df)__A, (__v2df)__B, __imm, (__v4df)_mm256_setzero_pd(), + (__mmask8)__U); +} + +#else +#define _mm256_insertf64x2(X, Y, C) \ + ((__m256d)__builtin_ia32_insertf64x2_256_mask( \ + (__v4df)(__m256d)(X), (__v2df)(__m128d)(Y), (int)(C), \ + (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)-1)) + +#define _mm256_mask_insertf64x2(W, U, X, Y, C) \ + ((__m256d)__builtin_ia32_insertf64x2_256_mask( \ + (__v4df)(__m256d)(X), (__v2df)(__m128d)(Y), (int)(C), \ + (__v4df)(__m256d)(W), (__mmask8)(U))) + +#define _mm256_maskz_insertf64x2(U, X, Y, C) \ + ((__m256d)__builtin_ia32_insertf64x2_256_mask( \ + (__v4df)(__m256d)(X), (__v2df)(__m128d)(Y), (int)(C), \ + (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)(U))) + +#define _mm256_inserti64x2(X, Y, C) \ + ((__m256i)__builtin_ia32_inserti64x2_256_mask( \ + (__v4di)(__m256i)(X), (__v2di)(__m128i)(Y), (int)(C), \ + (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask8)-1)) + +#define _mm256_mask_inserti64x2(W, U, X, Y, C) \ + ((__m256i)__builtin_ia32_inserti64x2_256_mask( \ + (__v4di)(__m256i)(X), (__v2di)(__m128i)(Y), (int)(C), \ + (__v4di)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_inserti64x2(U, X, Y, C) \ + ((__m256i)__builtin_ia32_inserti64x2_256_mask( \ + (__v4di)(__m256i)(X), (__v2di)(__m128i)(Y), (int)(C), \ + (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask8)(U))) + +#define _mm256_extractf64x2_pd(X, C) \ + ((__m128d)__builtin_ia32_extractf64x2_256_mask( \ + (__v4df)(__m256d)(X), (int)(C), (__v2df)(__m128d)_mm_setzero_pd(), \ + (__mmask8)-1)) + +#define _mm256_mask_extractf64x2_pd(W, U, X, C) \ + ((__m128d)__builtin_ia32_extractf64x2_256_mask( \ + (__v4df)(__m256d)(X), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U))) + +#define _mm256_maskz_extractf64x2_pd(U, X, C) \ + ((__m128d)__builtin_ia32_extractf64x2_256_mask( \ + (__v4df)(__m256d)(X), (int)(C), (__v2df)(__m128d)_mm_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm256_extracti64x2_epi64(X, C) \ + ((__m128i)__builtin_ia32_extracti64x2_256_mask( \ + (__v4di)(__m256i)(X), (int)(C), (__v2di)(__m128i)_mm_setzero_si128(), \ + (__mmask8)-1)) + +#define _mm256_mask_extracti64x2_epi64(W, U, X, C) \ + ((__m128i)__builtin_ia32_extracti64x2_256_mask( \ + (__v4di)(__m256i)(X), (int)(C), (__v2di)(__m128i)(W), (__mmask8)(U))) + +#define _mm256_maskz_extracti64x2_epi64(U, X, C) \ + ((__m128i)__builtin_ia32_extracti64x2_256_mask( \ + (__v4di)(__m256i)(X), (int)(C), (__v2di)(__m128i)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_reduce_pd(A, B) \ + ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)-1)) + +#define _mm256_mask_reduce_pd(W, U, A, B) \ + ((__m256d)__builtin_ia32_reducepd256_mask( \ + (__v4df)(__m256d)(A), (int)(B), 
(__v4df)(__m256d)(W), (__mmask8)(U))) + +#define _mm256_maskz_reduce_pd(U, A, B) \ + ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm_reduce_pd(A, B) \ + ((__m128d)__builtin_ia32_reducepd128_mask( \ + (__v2df)(__m128d)(A), (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)-1)) + +#define _mm_mask_reduce_pd(W, U, A, B) \ + ((__m128d)__builtin_ia32_reducepd128_mask( \ + (__v2df)(__m128d)(A), (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U))) + +#define _mm_maskz_reduce_pd(U, A, B) \ + ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm256_reduce_ps(A, B) \ + ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1)) + +#define _mm256_mask_reduce_ps(W, U, A, B) \ + ((__m256)__builtin_ia32_reduceps256_mask( \ + (__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U))) + +#define _mm256_maskz_reduce_ps(U, A, B) \ + ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm_reduce_ps(A, B) \ + ((__m128)__builtin_ia32_reduceps128_mask( \ + (__v4sf)(__m128)(A), (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)-1)) + +#define _mm_mask_reduce_ps(W, U, A, B) \ + ((__m128)__builtin_ia32_reduceps128_mask( \ + (__v4sf)(__m128)(A), (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U))) + +#define _mm_maskz_reduce_ps(U, A, B) \ + ((__m128)__builtin_ia32_reduceps128_mask( \ + (__v4sf)(__m128)(A), (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)(U))) + +#define _mm256_range_pd(A, B, C) \ + ((__m256d)__builtin_ia32_rangepd256_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_setzero_pd(), (__mmask8)-1)) + +#define _mm256_maskz_range_pd(U, A, B, C) \ + ((__m256d)__builtin_ia32_rangepd256_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_setzero_pd(), (__mmask8)(U))) + +#define _mm_range_pd(A, B, C) \ + ((__m128d)__builtin_ia32_rangepd128_mask( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), (__mmask8)-1)) + +#define _mm256_range_ps(A, B, C) \ + ((__m256)__builtin_ia32_rangeps256_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_setzero_ps(), (__mmask8)-1)) + +#define _mm256_mask_range_ps(W, U, A, B, C) \ + ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)(__m256)(W), (__mmask8)(U))) + +#define _mm256_maskz_range_ps(U, A, B, C) \ + ((__m256)__builtin_ia32_rangeps256_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_setzero_ps(), (__mmask8)(U))) + +#define _mm_range_ps(A, B, C) \ + ((__m128)__builtin_ia32_rangeps128_mask( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)-1)) + +#define _mm_mask_range_ps(W, U, A, B, C) \ + ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)(__m128)(W), (__mmask8)(U))) + +#define _mm_maskz_range_ps(U, A, B, C) \ + ((__m128)__builtin_ia32_rangeps128_mask( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)(U))) + +#define _mm256_mask_range_pd(W, U, A, B, C) \ + ((__m256d)__builtin_ia32_rangepd256_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)(__m256d)(W), (__mmask8)(U))) + 
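/*
 * Note (illustrative): these macro fallbacks mirror the inline
 * definitions in the #ifdef __OPTIMIZE__ branch above. The builtins
 * demand a compile-time-constant immediate, and without optimization
 * GCC will not fold an inline wrapper's argument to a constant, so the
 * header pastes the literal straight into the builtin instead. Either
 * way the immediate must be a constant expression:
 *
 *   __m128d lo = _mm256_extractf64x2_pd(v, 0);    // ok
 *   __m128d hi = _mm256_extractf64x2_pd(v, sel);  // error unless sel is
 *                                                 // a constant
 */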
+#define _mm_mask_range_pd(W, U, A, B, C) \
+  ((__m128d)__builtin_ia32_rangepd128_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+      (__v2df)(__m128d)(W), (__mmask8)(U)))
+
+#define _mm_maskz_range_pd(U, A, B, C) \
+  ((__m128d)__builtin_ia32_rangepd128_mask( \
+      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \
+      (__v2df)_mm_setzero_pd(), (__mmask8)(U)))
+
+#define _mm256_mask_fpclass_pd_mask(u, X, C) \
+  ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(X), (int)(C), \
+                                              (__mmask8)(u)))
+
+#define _mm256_mask_fpclass_ps_mask(u, X, C) \
+  ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(X), (int)(C), \
+                                              (__mmask8)(u)))
+
+#define _mm_mask_fpclass_pd_mask(u, X, C) \
+  ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(X), (int)(C), \
+                                              (__mmask8)(u)))
+
+#define _mm_mask_fpclass_ps_mask(u, X, C) \
+  ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(X), (int)(C), \
+                                              (__mmask8)(u)))
+
+#define _mm256_fpclass_pd_mask(X, C) \
+  ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(X), (int)(C), \
+                                              (__mmask8)-1))
+
+#define _mm256_fpclass_ps_mask(X, C) \
+  ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(X), (int)(C), \
+                                              (__mmask8)-1))
+
+#define _mm_fpclass_pd_mask(X, C) \
+  ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(X), (int)(C), \
+                                              (__mmask8)-1))
+
+#define _mm_fpclass_ps_mask(X, C) \
+  ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(X), (int)(C), \
+                                              (__mmask8)-1))
+
+#endif
+
+#ifdef __DISABLE_AVX512VLDQ__
+#undef __DISABLE_AVX512VLDQ__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VLDQ__ */
+
+#endif /* _AVX512VLDQINTRIN_H_INCLUDED */
diff --git a/third_party/intel/avx512vlintrin.internal.h b/third_party/intel/avx512vlintrin.internal.h
new file mode 100644
index 000000000..c6b4630ac
--- /dev/null
+++ b/third_party/intel/avx512vlintrin.internal.h
@@ -0,0 +1,9980 @@
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vlintrin.h> directly; include <immintrin.h> instead."
+#endif + +#ifndef _AVX512VLINTRIN_H_INCLUDED +#define _AVX512VLINTRIN_H_INCLUDED + +#ifndef __AVX512VL__ +#pragma GCC push_options +#pragma GCC target("avx512vl") +#define __DISABLE_AVX512VL__ +#endif /* __AVX512VL__ */ + +typedef unsigned int __mmask32; + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_mov_pd(__m256d __W, __mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_movapd256_mask((__v4df)__A, (__v4df)__W, + (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_mov_pd(__mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_movapd256_mask( + (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_movapd128_mask((__v2df)__A, (__v2df)__W, + (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_mov_pd(__mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_movapd128_mask( + (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_load_pd(__m256d __W, __mmask8 __U, void const *__P) { + return (__m256d)__builtin_ia32_loadapd256_mask((__v4df *)__P, (__v4df)__W, + (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_load_pd(__mmask8 __U, void const *__P) { + return (__m256d)__builtin_ia32_loadapd256_mask( + (__v4df *)__P, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_load_pd(__m128d __W, __mmask8 __U, void const *__P) { + return (__m128d)__builtin_ia32_loadapd128_mask((__v2df *)__P, (__v2df)__W, + (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_load_pd(__mmask8 __U, void const *__P) { + return (__m128d)__builtin_ia32_loadapd128_mask( + (__v2df *)__P, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_store_pd(void *__P, __mmask8 __U, __m256d __A) { + __builtin_ia32_storeapd256_mask((__v4df *)__P, (__v4df)__A, (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_store_pd(void *__P, __mmask8 __U, __m128d __A) { + __builtin_ia32_storeapd128_mask((__v2df *)__P, (__v2df)__A, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_mov_ps(__m256 __W, __mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_movaps256_mask((__v8sf)__A, (__v8sf)__W, + (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_mov_ps(__mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_movaps256_mask( + (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mov_ps(__m128 __W, __mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_movaps128_mask((__v4sf)__A, (__v4sf)__W, + (__mmask8)__U); +} + 
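/*
 * Usage sketch (illustrative, hypothetical helper; assumes -mavx512vl):
 * the mask_/maskz_ mov, load, and store forms in this header touch only
 * lanes whose mask bit is set, and masked-off lanes of a load cannot
 * fault, which makes loop tails branch-free. Note the load/store names
 * want naturally aligned pointers; the loadu/storeu forms further down
 * take any pointer. E.g. summing the n <= 4 doubles left at the end of
 * a 32-byte-aligned buffer:
 *
 *   static inline double sum_tail(const double *p, unsigned n) {
 *     __mmask8 k = (__mmask8)((1u << n) - 1);
 *     __m256d v = _mm256_maskz_load_pd(k, p);
 *     __m128d s = _mm_add_pd(_mm256_castpd256_pd128(v),
 *                            _mm256_extractf128_pd(v, 1));
 *     return _mm_cvtsd_f64(_mm_add_sd(s, _mm_unpackhi_pd(s, s)));
 *   }
 */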
+extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_mov_ps(__mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_movaps128_mask( + (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_load_ps(__m256 __W, __mmask8 __U, void const *__P) { + return (__m256)__builtin_ia32_loadaps256_mask((__v8sf *)__P, (__v8sf)__W, + (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_load_ps(__mmask8 __U, void const *__P) { + return (__m256)__builtin_ia32_loadaps256_mask( + (__v8sf *)__P, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_load_ps(__m128 __W, __mmask8 __U, void const *__P) { + return (__m128)__builtin_ia32_loadaps128_mask((__v4sf *)__P, (__v4sf)__W, + (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_load_ps(__mmask8 __U, void const *__P) { + return (__m128)__builtin_ia32_loadaps128_mask( + (__v4sf *)__P, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_store_ps(void *__P, __mmask8 __U, __m256 __A) { + __builtin_ia32_storeaps256_mask((__v8sf *)__P, (__v8sf)__A, (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_store_ps(void *__P, __mmask8 __U, __m128 __A) { + __builtin_ia32_storeaps128_mask((__v4sf *)__P, (__v4sf)__A, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_mov_epi64(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_movdqa64_256_mask((__v4di)__A, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_mov_epi64(__mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_movdqa64_256_mask( + (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mov_epi64(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_movdqa64_128_mask((__v2di)__A, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_mov_epi64(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_movdqa64_128_mask( + (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_load_epi64(__m256i __W, __mmask8 __U, void const *__P) { + return (__m256i)__builtin_ia32_movdqa64load256_mask( + (__v4di *)__P, (__v4di)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_load_epi64(__mmask8 __U, void const *__P) { + return (__m256i)__builtin_ia32_movdqa64load256_mask( + (__v4di *)__P, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_load_epi64(__m128i __W, __mmask8 __U, void const *__P) { + return 
(__m128i)__builtin_ia32_movdqa64load128_mask( + (__v2di *)__P, (__v2di)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_load_epi64(__mmask8 __U, void const *__P) { + return (__m128i)__builtin_ia32_movdqa64load128_mask( + (__v2di *)__P, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_store_epi64(void *__P, __mmask8 __U, __m256i __A) { + __builtin_ia32_movdqa64store256_mask((__v4di *)__P, (__v4di)__A, + (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_store_epi64(void *__P, __mmask8 __U, __m128i __A) { + __builtin_ia32_movdqa64store128_mask((__v2di *)__P, (__v2di)__A, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_mov_epi32(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_movdqa32_256_mask((__v8si)__A, (__v8si)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_mov_epi32(__mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_movdqa32_256_mask( + (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mov_epi32(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_movdqa32_128_mask((__v4si)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_mov_epi32(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_movdqa32_128_mask( + (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_load_epi32(__m256i __W, __mmask8 __U, void const *__P) { + return (__m256i)__builtin_ia32_movdqa32load256_mask( + (__v8si *)__P, (__v8si)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_load_epi32(__mmask8 __U, void const *__P) { + return (__m256i)__builtin_ia32_movdqa32load256_mask( + (__v8si *)__P, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_load_epi32(__m128i __W, __mmask8 __U, void const *__P) { + return (__m128i)__builtin_ia32_movdqa32load128_mask( + (__v4si *)__P, (__v4si)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_load_epi32(__mmask8 __U, void const *__P) { + return (__m128i)__builtin_ia32_movdqa32load128_mask( + (__v4si *)__P, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_store_epi32(void *__P, __mmask8 __U, __m256i __A) { + __builtin_ia32_movdqa32store256_mask((__v8si *)__P, (__v8si)__A, + (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_store_epi32(void *__P, __mmask8 __U, __m128i __A) { + __builtin_ia32_movdqa32store128_mask((__v4si *)__P, (__v4si)__A, + (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, 
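/*
 * Usage sketch (illustrative, hypothetical helper): the masked
 * arithmetic below follows the convention used throughout this header;
 * lanes whose mask bit is set get A op B, the rest come from W (mask_)
 * or are zeroed (maskz_). That makes branch-free conditional
 * accumulation a one-liner:
 *
 *   static inline __m256d acc_if_lt(__m256d sum, __m256d x, __m256d lim) {
 *     __mmask8 k = _mm256_cmp_pd_mask(x, lim, _CMP_LT_OQ);
 *     return _mm256_mask_add_pd(sum, k, sum, x);  // sum += x where x < lim
 *   }
 */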
__always_inline__, __artificial__)) + _mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_addpd128_mask((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_addpd128_mask( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_addpd256_mask((__v4df)__A, (__v4df)__B, + (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_addpd256_mask( + (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_addps128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_addps128_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_addps256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__W, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_addps256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_subpd128_mask((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_subpd128_mask( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_subpd256_mask((__v4df)__A, (__v4df)__B, + (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_subpd256_mask( + (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_subps128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U); +} + +extern 
__inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_subps128_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_subps256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__W, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_subps256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_store_epi64(void *__P, __m256i __A) { + *(__m256i *)__P = __A; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_epi64(void *__P, __m128i __A) { + *(__m128i *)__P = __A; +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_loadu_pd(__m256d __W, __mmask8 __U, void const *__P) { + return (__m256d)__builtin_ia32_loadupd256_mask((const double *)__P, + (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_loadu_pd(__mmask8 __U, void const *__P) { + return (__m256d)__builtin_ia32_loadupd256_mask( + (const double *)__P, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_loadu_pd(__m128d __W, __mmask8 __U, void const *__P) { + return (__m128d)__builtin_ia32_loadupd128_mask((const double *)__P, + (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_loadu_pd(__mmask8 __U, void const *__P) { + return (__m128d)__builtin_ia32_loadupd128_mask( + (const double *)__P, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_storeu_pd(void *__P, __mmask8 __U, __m256d __A) { + __builtin_ia32_storeupd256_mask((double *)__P, (__v4df)__A, (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_storeu_pd(void *__P, __mmask8 __U, __m128d __A) { + __builtin_ia32_storeupd128_mask((double *)__P, (__v2df)__A, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_loadu_ps(__m256 __W, __mmask8 __U, void const *__P) { + return (__m256)__builtin_ia32_loadups256_mask((const float *)__P, (__v8sf)__W, + (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_loadu_ps(__mmask8 __U, void const *__P) { + return (__m256)__builtin_ia32_loadups256_mask( + (const float *)__P, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_loadu_ps(__m128 __W, __mmask8 __U, void const *__P) { + return (__m128)__builtin_ia32_loadups128_mask((const float *)__P, (__v4sf)__W, + (__mmask8)__U); +} + +extern __inline __m128 + 
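/*
 * Note (illustrative, hypothetical helper): unlike the aligned
 * load/store pairs above, the loadu/storeu forms accept any pointer,
 * so a masked tail copy needs no alignment prologue at all:
 *
 *   static inline void copy_tail(float *dst, const float *src, unsigned n) {
 *     __mmask8 k = (__mmask8)((1u << n) - 1);   // n <= 8 floats
 *     _mm256_mask_storeu_ps(dst, k, _mm256_maskz_loadu_ps(k, src));
 *   }
 */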
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_loadu_ps(__mmask8 __U, void const *__P) { + return (__m128)__builtin_ia32_loadups128_mask( + (const float *)__P, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_storeu_ps(void *__P, __mmask8 __U, __m256 __A) { + __builtin_ia32_storeups256_mask((float *)__P, (__v8sf)__A, (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_storeu_ps(void *__P, __mmask8 __U, __m128 __A) { + __builtin_ia32_storeups128_mask((float *)__P, (__v4sf)__A, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_loadu_epi64(__m256i __W, __mmask8 __U, void const *__P) { + return (__m256i)__builtin_ia32_loaddqudi256_mask((const long long *)__P, + (__v4di)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_loadu_epi64(__mmask8 __U, void const *__P) { + return (__m256i)__builtin_ia32_loaddqudi256_mask( + (const long long *)__P, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_loadu_epi64(__m128i __W, __mmask8 __U, void const *__P) { + return (__m128i)__builtin_ia32_loaddqudi128_mask((const long long *)__P, + (__v2di)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_loadu_epi64(__mmask8 __U, void const *__P) { + return (__m128i)__builtin_ia32_loaddqudi128_mask( + (const long long *)__P, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_storeu_epi64(void *__P, __mmask8 __U, __m256i __A) { + __builtin_ia32_storedqudi256_mask((long long *)__P, (__v4di)__A, + (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_storeu_epi64(void *__P, __mmask8 __U, __m128i __A) { + __builtin_ia32_storedqudi128_mask((long long *)__P, (__v2di)__A, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_loadu_epi32(__m256i __W, __mmask8 __U, void const *__P) { + return (__m256i)__builtin_ia32_loaddqusi256_mask((const int *)__P, + (__v8si)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_loadu_epi32(__mmask8 __U, void const *__P) { + return (__m256i)__builtin_ia32_loaddqusi256_mask( + (const int *)__P, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_loadu_epi32(__m128i __W, __mmask8 __U, void const *__P) { + return (__m128i)__builtin_ia32_loaddqusi128_mask((const int *)__P, + (__v4si)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_loadu_epi32(__mmask8 __U, void const *__P) { + return (__m128i)__builtin_ia32_loaddqusi128_mask( + (const int *)__P, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_storeu_epi32(void *__P, __mmask8 __U, 
__m256i __A) { + __builtin_ia32_storedqusi256_mask((int *)__P, (__v8si)__A, (__mmask8)__U); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_storeu_epi32(void *__P, __mmask8 __U, __m128i __A) { + __builtin_ia32_storedqusi128_mask((int *)__P, (__v4si)__A, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_pabsd256_mask((__v8si)__A, (__v8si)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_pabsd256_mask( + (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pabsd128_mask((__v4si)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pabsd128_mask( + (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_abs_epi64(__m256i __A) { + return (__m256i)__builtin_ia32_pabsq256_mask( + (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_abs_epi64(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_pabsq256_mask((__v4di)__A, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_abs_epi64(__mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_pabsq256_mask( + (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_abs_epi64(__m128i __A) { + return (__m128i)__builtin_ia32_pabsq128_mask( + (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_abs_epi64(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pabsq128_mask((__v2di)__A, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_abs_epi64(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pabsq128_mask( + (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtpd_epu32(__m256d __A) { + return (__m128i)__builtin_ia32_cvtpd2udq256_mask( + (__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtpd_epu32(__m128i __W, __mmask8 __U, __m256d __A) { + return (__m128i)__builtin_ia32_cvtpd2udq256_mask((__v4df)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtpd_epu32(__mmask8 __U, __m256d __A) { + return 
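/*
 * Note (illustrative): _mm256_abs_epi64 (VPABSQ) fills a long-standing
 * gap; before AVX-512VL a 64-bit absolute value took a compare plus a
 * blend. Similarly _mm256_cvtpd_epu32 converts to unsigned 32-bit
 * integers directly, which no SSE/AVX instruction could do. E.g.:
 *
 *   __m256i dist = _mm256_abs_epi64(_mm256_sub_epi64(a, b));
 */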
(__m128i)__builtin_ia32_cvtpd2udq256_mask( + (__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpd_epu32(__m128d __A) { + return (__m128i)__builtin_ia32_cvtpd2udq128_mask( + (__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtpd_epu32(__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvtpd2udq128_mask((__v2df)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtpd_epu32(__mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvtpd2udq128_mask( + (__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvttps_epi32(__m256i __W, __mmask8 __U, __m256 __A) { + return (__m256i)__builtin_ia32_cvttps2dq256_mask((__v8sf)__A, (__v8si)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvttps_epi32(__mmask8 __U, __m256 __A) { + return (__m256i)__builtin_ia32_cvttps2dq256_mask( + (__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvttps_epi32(__m128i __W, __mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_cvttps2dq128_mask((__v4sf)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvttps_epi32(__mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_cvttps2dq128_mask( + (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvttps_epu32(__m256 __A) { + return (__m256i)__builtin_ia32_cvttps2udq256_mask( + (__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvttps_epu32(__m256i __W, __mmask8 __U, __m256 __A) { + return (__m256i)__builtin_ia32_cvttps2udq256_mask((__v8sf)__A, (__v8si)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvttps_epu32(__mmask8 __U, __m256 __A) { + return (__m256i)__builtin_ia32_cvttps2udq256_mask( + (__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttps_epu32(__m128 __A) { + return (__m128i)__builtin_ia32_cvttps2udq128_mask( + (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvttps_epu32(__m128i __W, __mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_cvttps2udq128_mask((__v4sf)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvttps_epu32(__mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_cvttps2udq128_mask( + (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, 
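/*
 * Usage sketch (illustrative): the cvtt* forms truncate toward zero
 * like a C cast, regardless of the MXCSR rounding mode, while the
 * plain cvt* forms round according to MXCSR. The epu32 variants give
 * a direct float to unsigned conversion:
 *
 *   __m128i u = _mm_cvttps_epu32(x);   // (uint32_t)x per lane
 */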
__always_inline__, __artificial__)) + _mm256_mask_cvttpd_epi32(__m128i __W, __mmask8 __U, __m256d __A) { + return (__m128i)__builtin_ia32_cvttpd2dq256_mask((__v4df)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvttpd_epi32(__mmask8 __U, __m256d __A) { + return (__m128i)__builtin_ia32_cvttpd2dq256_mask( + (__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvttpd_epi32(__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvttpd2dq128_mask((__v2df)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvttpd_epi32(__mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvttpd2dq128_mask( + (__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvttpd_epu32(__m256d __A) { + return (__m128i)__builtin_ia32_cvttpd2udq256_mask( + (__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvttpd_epu32(__m128i __W, __mmask8 __U, __m256d __A) { + return (__m128i)__builtin_ia32_cvttpd2udq256_mask((__v4df)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvttpd_epu32(__mmask8 __U, __m256d __A) { + return (__m128i)__builtin_ia32_cvttpd2udq256_mask( + (__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttpd_epu32(__m128d __A) { + return (__m128i)__builtin_ia32_cvttpd2udq128_mask( + (__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvttpd_epu32(__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvttpd2udq128_mask((__v2df)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvttpd_epu32(__mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvttpd2udq128_mask( + (__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtpd_epi32(__m128i __W, __mmask8 __U, __m256d __A) { + return (__m128i)__builtin_ia32_cvtpd2dq256_mask((__v4df)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtpd_epi32(__mmask8 __U, __m256d __A) { + return (__m128i)__builtin_ia32_cvtpd2dq256_mask( + (__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtpd_epi32(__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvtpd2dq128_mask((__v2df)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtpd_epi32(__mmask8 __U, __m128d __A) { + return (__m128i)__builtin_ia32_cvtpd2dq128_mask( + (__v2df)__A, 
(__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi32_pd(__m256d __W, __mmask8 __U, __m128i __A) { + return (__m256d)__builtin_ia32_cvtdq2pd256_mask((__v4si)__A, (__v4df)__W, + (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi32_pd(__mmask8 __U, __m128i __A) { + return (__m256d)__builtin_ia32_cvtdq2pd256_mask( + (__v4si)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi32_pd(__m128d __W, __mmask8 __U, __m128i __A) { + return (__m128d)__builtin_ia32_cvtdq2pd128_mask((__v4si)__A, (__v2df)__W, + (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi32_pd(__mmask8 __U, __m128i __A) { + return (__m128d)__builtin_ia32_cvtdq2pd128_mask( + (__v4si)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepu32_pd(__m128i __A) { + return (__m256d)__builtin_ia32_cvtudq2pd256_mask( + (__v4si)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepu32_pd(__m256d __W, __mmask8 __U, __m128i __A) { + return (__m256d)__builtin_ia32_cvtudq2pd256_mask((__v4si)__A, (__v4df)__W, + (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepu32_pd(__mmask8 __U, __m128i __A) { + return (__m256d)__builtin_ia32_cvtudq2pd256_mask( + (__v4si)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu32_pd(__m128i __A) { + return (__m128d)__builtin_ia32_cvtudq2pd128_mask( + (__v4si)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepu32_pd(__m128d __W, __mmask8 __U, __m128i __A) { + return (__m128d)__builtin_ia32_cvtudq2pd128_mask((__v4si)__A, (__v2df)__W, + (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepu32_pd(__mmask8 __U, __m128i __A) { + return (__m128d)__builtin_ia32_cvtudq2pd128_mask( + (__v4si)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi32_ps(__m256 __W, __mmask8 __U, __m256i __A) { + return (__m256)__builtin_ia32_cvtdq2ps256_mask((__v8si)__A, (__v8sf)__W, + (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi32_ps(__mmask8 __U, __m256i __A) { + return (__m256)__builtin_ia32_cvtdq2ps256_mask( + (__v8si)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi32_ps(__m128 __W, __mmask8 __U, __m128i __A) { + return (__m128)__builtin_ia32_cvtdq2ps128_mask((__v4si)__A, (__v4sf)__W, + (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi32_ps(__mmask8 __U, __m128i __A) { + 
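/*
 * Note (illustrative): _mm_cvtepu32_pd and friends read the source
 * lanes as unsigned 32-bit integers; the classic cvtepi32 forms would
 * turn values above INT32_MAX into negatives. Conversion to double is
 * exact over the whole 0..4294967295 range:
 *
 *   __m256d d = _mm256_cvtepu32_pd(u);   // u holds four uint32_t
 */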
return (__m128)__builtin_ia32_cvtdq2ps128_mask( + (__v4si)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepu32_ps(__m256i __A) { + return (__m256)__builtin_ia32_cvtudq2ps256_mask( + (__v8si)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepu32_ps(__m256 __W, __mmask8 __U, __m256i __A) { + return (__m256)__builtin_ia32_cvtudq2ps256_mask((__v8si)__A, (__v8sf)__W, + (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepu32_ps(__mmask8 __U, __m256i __A) { + return (__m256)__builtin_ia32_cvtudq2ps256_mask( + (__v8si)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu32_ps(__m128i __A) { + return (__m128)__builtin_ia32_cvtudq2ps128_mask( + (__v4si)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepu32_ps(__m128 __W, __mmask8 __U, __m128i __A) { + return (__m128)__builtin_ia32_cvtudq2ps128_mask((__v4si)__A, (__v4sf)__W, + (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepu32_ps(__mmask8 __U, __m128i __A) { + return (__m128)__builtin_ia32_cvtudq2ps128_mask( + (__v4si)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtps_pd(__m256d __W, __mmask8 __U, __m128 __A) { + return (__m256d)__builtin_ia32_cvtps2pd256_mask((__v4sf)__A, (__v4df)__W, + (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtps_pd(__mmask8 __U, __m128 __A) { + return (__m256d)__builtin_ia32_cvtps2pd256_mask( + (__v4sf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtps_pd(__m128d __W, __mmask8 __U, __m128 __A) { + return (__m128d)__builtin_ia32_cvtps2pd128_mask((__v4sf)__A, (__v2df)__W, + (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtps_pd(__mmask8 __U, __m128 __A) { + return (__m128d)__builtin_ia32_cvtps2pd128_mask( + (__v4sf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi32_epi8(__m128i __A) { + return (__m128i)__builtin_ia32_pmovdb128_mask( + (__v4si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi32_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovdb128mem_mask((__v16qi *)__P, (__v4si)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovdb128_mask((__v4si)__A, (__v16qi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi32_epi8(__mmask8 __M, __m128i __A) { + return 
(__m128i)__builtin_ia32_pmovdb128_mask( + (__v4si)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi32_epi8(__m256i __A) { + return (__m128i)__builtin_ia32_pmovdb256_mask( + (__v8si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovdb256_mask((__v8si)__A, (__v16qi)__O, __M); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi32_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) { + __builtin_ia32_pmovdb256mem_mask((__v16qi *)__P, (__v8si)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi32_epi8(__mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovdb256_mask( + (__v8si)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsepi32_epi8(__m128i __A) { + return (__m128i)__builtin_ia32_pmovsdb128_mask( + (__v4si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtsepi32_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovsdb128mem_mask((__v16qi *)__P, (__v4si)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtsepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsdb128_mask((__v4si)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtsepi32_epi8(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsdb128_mask( + (__v4si)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtsepi32_epi8(__m256i __A) { + return (__m128i)__builtin_ia32_pmovsdb256_mask( + (__v8si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtsepi32_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) { + __builtin_ia32_pmovsdb256mem_mask((__v16qi *)__P, (__v8si)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtsepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovsdb256_mask((__v8si)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtsepi32_epi8(__mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovsdb256_mask( + (__v8si)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtusepi32_epi8(__m128i __A) { + return (__m128i)__builtin_ia32_pmovusdb128_mask( + (__v4si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtusepi32_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovusdb128mem_mask((__v16qi 
*)__P, (__v4si)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtusepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovusdb128_mask((__v4si)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtusepi32_epi8(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovusdb128_mask( + (__v4si)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtusepi32_epi8(__m256i __A) { + return (__m128i)__builtin_ia32_pmovusdb256_mask( + (__v8si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtusepi32_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) { + __builtin_ia32_pmovusdb256mem_mask((__v16qi *)__P, (__v8si)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtusepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovusdb256_mask((__v8si)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtusepi32_epi8(__mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovusdb256_mask( + (__v8si)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi32_epi16(__m128i __A) { + return (__m128i)__builtin_ia32_pmovdw128_mask( + (__v4si)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi32_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovdw128mem_mask((__v8hi *)__P, (__v4si)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovdw128_mask((__v4si)__A, (__v8hi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi32_epi16(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovdw128_mask( + (__v4si)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi32_epi16(__m256i __A) { + return (__m128i)__builtin_ia32_pmovdw256_mask( + (__v8si)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi32_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) { + __builtin_ia32_pmovdw256mem_mask((__v8hi *)__P, (__v8si)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovdw256_mask((__v8si)__A, (__v8hi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi32_epi16(__mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovdw256_mask( + (__v8si)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline 
__m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsepi32_epi16(__m128i __A) { + return (__m128i)__builtin_ia32_pmovsdw128_mask( + (__v4si)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtsepi32_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovsdw128mem_mask((__v8hi *)__P, (__v4si)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtsepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsdw128_mask((__v4si)__A, (__v8hi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtsepi32_epi16(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsdw128_mask( + (__v4si)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtsepi32_epi16(__m256i __A) { + return (__m128i)__builtin_ia32_pmovsdw256_mask( + (__v8si)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtsepi32_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) { + __builtin_ia32_pmovsdw256mem_mask((__v8hi *)__P, (__v8si)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtsepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovsdw256_mask((__v8si)__A, (__v8hi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtsepi32_epi16(__mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovsdw256_mask( + (__v8si)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtusepi32_epi16(__m128i __A) { + return (__m128i)__builtin_ia32_pmovusdw128_mask( + (__v4si)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtusepi32_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovusdw128mem_mask((__v8hi *)__P, (__v4si)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtusepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovusdw128_mask((__v4si)__A, (__v8hi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtusepi32_epi16(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovusdw128_mask( + (__v4si)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtusepi32_epi16(__m256i __A) { + return (__m128i)__builtin_ia32_pmovusdw256_mask( + (__v8si)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtusepi32_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) { + __builtin_ia32_pmovusdw256mem_mask((__v8hi *)__P, (__v8si)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm256_mask_cvtusepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovusdw256_mask((__v8si)__A, (__v8hi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtusepi32_epi16(__mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovusdw256_mask( + (__v8si)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi64_epi8(__m128i __A) { + return (__m128i)__builtin_ia32_pmovqb128_mask( + (__v2di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovqb128mem_mask((__v16qi *)__P, (__v2di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovqb128_mask((__v2di)__A, (__v16qi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi64_epi8(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovqb128_mask( + (__v2di)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi64_epi8(__m256i __A) { + return (__m128i)__builtin_ia32_pmovqb256_mask( + (__v4di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) { + __builtin_ia32_pmovqb256mem_mask((__v16qi *)__P, (__v4di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovqb256_mask((__v4di)__A, (__v16qi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi64_epi8(__mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovqb256_mask( + (__v4di)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsepi64_epi8(__m128i __A) { + return (__m128i)__builtin_ia32_pmovsqb128_mask( + (__v2di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovsqb128mem_mask((__v16qi *)__P, (__v2di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsqb128_mask((__v2di)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtsepi64_epi8(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsqb128_mask( + (__v2di)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtsepi64_epi8(__m256i __A) { + 
return (__m128i)__builtin_ia32_pmovsqb256_mask( + (__v4di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) { + __builtin_ia32_pmovsqb256mem_mask((__v16qi *)__P, (__v4di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovsqb256_mask((__v4di)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtsepi64_epi8(__mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovsqb256_mask( + (__v4di)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtusepi64_epi8(__m128i __A) { + return (__m128i)__builtin_ia32_pmovusqb128_mask( + (__v2di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovusqb128mem_mask((__v16qi *)__P, (__v2di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovusqb128_mask((__v2di)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtusepi64_epi8(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovusqb128_mask( + (__v2di)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtusepi64_epi8(__m256i __A) { + return (__m128i)__builtin_ia32_pmovusqb256_mask( + (__v4di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) { + __builtin_ia32_pmovusqb256mem_mask((__v16qi *)__P, (__v4di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovusqb256_mask((__v4di)__A, (__v16qi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtusepi64_epi8(__mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovusqb256_mask( + (__v4di)__A, (__v16qi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi64_epi16(__m128i __A) { + return (__m128i)__builtin_ia32_pmovqw128_mask( + (__v2di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovqw128mem_mask((__v8hi *)__P, (__v2di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A) { + return 
(__m128i)__builtin_ia32_pmovqw128_mask((__v2di)__A, (__v8hi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi64_epi16(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovqw128_mask( + (__v2di)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi64_epi16(__m256i __A) { + return (__m128i)__builtin_ia32_pmovqw256_mask( + (__v4di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) { + __builtin_ia32_pmovqw256mem_mask((__v8hi *)__P, (__v4di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovqw256_mask((__v4di)__A, (__v8hi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi64_epi16(__mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovqw256_mask( + (__v4di)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsepi64_epi16(__m128i __A) { + return (__m128i)__builtin_ia32_pmovsqw128_mask( + (__v2di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovsqw128mem_mask((__v8hi *)__P, (__v2di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsqw128_mask((__v2di)__A, (__v8hi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtsepi64_epi16(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsqw128_mask( + (__v2di)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtsepi64_epi16(__m256i __A) { + return (__m128i)__builtin_ia32_pmovsqw256_mask( + (__v4di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) { + __builtin_ia32_pmovsqw256mem_mask((__v8hi *)__P, (__v4di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovsqw256_mask((__v4di)__A, (__v8hi)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtsepi64_epi16(__mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovsqw256_mask( + (__v4di)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtusepi64_epi16(__m128i __A) { + return (__m128i)__builtin_ia32_pmovusqw128_mask( + (__v2di)__A, (__v8hi)_mm_undefined_si128(), 
(__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovusqw128mem_mask((__v8hi *)__P, (__v2di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovusqw128_mask((__v2di)__A, (__v8hi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtusepi64_epi16(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovusqw128_mask( + (__v2di)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtusepi64_epi16(__m256i __A) { + return (__m128i)__builtin_ia32_pmovusqw256_mask( + (__v4di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) { + __builtin_ia32_pmovusqw256mem_mask((__v8hi *)__P, (__v4di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovusqw256_mask((__v4di)__A, (__v8hi)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtusepi64_epi16(__mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovusqw256_mask( + (__v4di)__A, (__v8hi)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi64_epi32(__m128i __A) { + return (__m128i)__builtin_ia32_pmovqd128_mask( + (__v2di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovqd128mem_mask((__v4si *)__P, (__v2di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovqd128_mask((__v2di)__A, (__v4si)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi64_epi32(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovqd128_mask( + (__v2di)__A, (__v4si)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi64_epi32(__m256i __A) { + return (__m128i)__builtin_ia32_pmovqd256_mask( + (__v4di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, __m256i __A) { + __builtin_ia32_pmovqd256mem_mask((__v4si *)__P, (__v4di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovqd256_mask((__v4di)__A, (__v4si)__O, __M); +} + +extern __inline __m128i + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi64_epi32(__mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovqd256_mask( + (__v4di)__A, (__v4si)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsepi64_epi32(__m128i __A) { + return (__m128i)__builtin_ia32_pmovsqd128_mask( + (__v2di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovsqd128mem_mask((__v4si *)__P, (__v2di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtsepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsqd128_mask((__v2di)__A, (__v4si)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtsepi64_epi32(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsqd128_mask( + (__v2di)__A, (__v4si)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtsepi64_epi32(__m256i __A) { + return (__m128i)__builtin_ia32_pmovsqd256_mask( + (__v4di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, __m256i __A) { + __builtin_ia32_pmovsqd256mem_mask((__v4si *)__P, (__v4di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtsepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovsqd256_mask((__v4di)__A, (__v4si)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtsepi64_epi32(__mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovsqd256_mask( + (__v4di)__A, (__v4si)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtusepi64_epi32(__m128i __A) { + return (__m128i)__builtin_ia32_pmovusqd128_mask( + (__v2di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, __m128i __A) { + __builtin_ia32_pmovusqd128mem_mask((__v4si *)__P, (__v2di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtusepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovusqd128_mask((__v2di)__A, (__v4si)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtusepi64_epi32(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pmovusqd128_mask( + (__v2di)__A, (__v4si)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtusepi64_epi32(__m256i __A) { + return (__m128i)__builtin_ia32_pmovusqd256_mask( + (__v4di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm256_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, __m256i __A) { + __builtin_ia32_pmovusqd256mem_mask((__v4si *)__P, (__v4di)__A, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtusepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovusqd256_mask((__v4di)__A, (__v4si)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtusepi64_epi32(__mmask8 __M, __m256i __A) { + return (__m128i)__builtin_ia32_pmovusqd256_mask( + (__v4di)__A, (__v4si)_mm_setzero_si128(), __M); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_broadcastss_ps(__m256 __O, __mmask8 __M, __m128 __A) { + return (__m256)__builtin_ia32_broadcastss256_mask((__v4sf)__A, (__v8sf)__O, + __M); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) { + return (__m256)__builtin_ia32_broadcastss256_mask( + (__v4sf)__A, (__v8sf)_mm256_setzero_ps(), __M); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_broadcastss_ps(__m128 __O, __mmask8 __M, __m128 __A) { + return (__m128)__builtin_ia32_broadcastss128_mask((__v4sf)__A, (__v4sf)__O, + __M); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) { + return (__m128)__builtin_ia32_broadcastss128_mask( + (__v4sf)__A, (__v4sf)_mm_setzero_ps(), __M); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_broadcastsd_pd(__m256d __O, __mmask8 __M, __m128d __A) { + return (__m256d)__builtin_ia32_broadcastsd256_mask((__v2df)__A, (__v4df)__O, + __M); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) { + return (__m256d)__builtin_ia32_broadcastsd256_mask( + (__v2df)__A, (__v4df)_mm256_setzero_pd(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_broadcastd_epi32(__m256i __O, __mmask8 __M, __m128i __A) { + return (__m256i)__builtin_ia32_pbroadcastd256_mask((__v4si)__A, (__v8si)__O, + __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) { + return (__m256i)__builtin_ia32_pbroadcastd256_mask( + (__v4si)__A, (__v8si)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) { + return (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask(__A, (__v8si)__O, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_set1_epi32(__mmask8 __M, int __A) { + return (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask( + __A, (__v8si)_mm256_setzero_si256(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_broadcastd_epi32(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pbroadcastd128_mask((__v4si)__A, (__v4si)__O, + __M); +} + +extern __inline __m128i + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pbroadcastd128_mask( + (__v4si)__A, (__v4si)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) { + return (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask(__A, (__v4si)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_set1_epi32(__mmask8 __M, int __A) { + return (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask( + __A, (__v4si)_mm_setzero_si128(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_broadcastq_epi64(__m256i __O, __mmask8 __M, __m128i __A) { + return (__m256i)__builtin_ia32_pbroadcastq256_mask((__v2di)__A, (__v4di)__O, + __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { + return (__m256i)__builtin_ia32_pbroadcastq256_mask( + (__v2di)__A, (__v4di)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) { + return (__m256i)__builtin_ia32_pbroadcastq256_gpr_mask(__A, (__v4di)__O, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_set1_epi64(__mmask8 __M, long long __A) { + return (__m256i)__builtin_ia32_pbroadcastq256_gpr_mask( + __A, (__v4di)_mm256_setzero_si256(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_broadcastq_epi64(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pbroadcastq128_mask((__v2di)__A, (__v2di)__O, + __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { + return (__m128i)__builtin_ia32_pbroadcastq128_mask( + (__v2di)__A, (__v2di)_mm_setzero_si128(), __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) { + return (__m128i)__builtin_ia32_pbroadcastq128_gpr_mask(__A, (__v2di)__O, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_set1_epi64(__mmask8 __M, long long __A) { + return (__m128i)__builtin_ia32_pbroadcastq128_gpr_mask( + __A, (__v2di)_mm_setzero_si128(), __M); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcast_f32x4(__m128 __A) { + return (__m256)__builtin_ia32_broadcastf32x4_256_mask( + (__v4sf)__A, (__v8sf)_mm256_undefined_pd(), (__mmask8)-1); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) { + return (__m256)__builtin_ia32_broadcastf32x4_256_mask((__v4sf)__A, + (__v8sf)__O, __M); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_broadcast_f32x4(__mmask8 __M, __m128 __A) { + return (__m256)__builtin_ia32_broadcastf32x4_256_mask( + (__v4sf)__A, (__v8sf)_mm256_setzero_ps(), __M); +} + 
+extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcast_i32x4(__m128i __A) { + return (__m256i)__builtin_ia32_broadcasti32x4_256_mask( + (__v4si)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) { + return (__m256i)__builtin_ia32_broadcasti32x4_256_mask((__v4si)__A, + (__v8si)__O, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) { + return (__m256i)__builtin_ia32_broadcasti32x4_256_mask( + (__v4si)__A, (__v8si)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi8_epi32(__m256i __W, __mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovsxbd256_mask((__v16qi)__A, (__v8si)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovsxbd256_mask( + (__v16qi)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsxbd128_mask((__v16qi)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsxbd128_mask( + (__v16qi)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovsxbq256_mask((__v16qi)__A, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovsxbq256_mask( + (__v16qi)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsxbq128_mask((__v16qi)__A, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsxbq128_mask( + (__v16qi)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovsxwd256_mask((__v8hi)__A, (__v8si)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovsxwd256_mask( + (__v8hi)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsxwd128_mask((__v8hi)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsxwd128_mask( + (__v8hi)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovsxwq256_mask((__v8hi)__A, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovsxwq256_mask( + (__v8hi)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsxwq128_mask((__v8hi)__A, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovsxwq128_mask( + (__v8hi)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X) { + return (__m256i)__builtin_ia32_pmovsxdq256_mask((__v4si)__X, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) { + return (__m256i)__builtin_ia32_pmovsxdq256_mask( + (__v4si)__X, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X) { + return (__m128i)__builtin_ia32_pmovsxdq128_mask((__v4si)__X, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) { + return (__m128i)__builtin_ia32_pmovsxdq128_mask( + (__v4si)__X, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovzxbd256_mask((__v16qi)__A, (__v8si)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovzxbd256_mask( + (__v16qi)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovzxbd128_mask((__v16qi)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovzxbd128_mask( 
+ (__v16qi)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovzxbq256_mask((__v16qi)__A, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovzxbq256_mask( + (__v16qi)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovzxbq128_mask((__v16qi)__A, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovzxbq128_mask( + (__v16qi)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovzxwd256_mask((__v8hi)__A, (__v8si)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovzxwd256_mask( + (__v8hi)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovzxwd128_mask((__v8hi)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovzxwd128_mask( + (__v8hi)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovzxwq256_mask((__v8hi)__A, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { + return (__m256i)__builtin_ia32_pmovzxwq256_mask( + (__v8hi)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovzxwq128_mask((__v8hi)__A, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_pmovzxwq128_mask( + (__v8hi)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X) { + return (__m256i)__builtin_ia32_pmovzxdq256_mask((__v4si)__X, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) { + return (__m256i)__builtin_ia32_pmovzxdq256_mask( + (__v4si)__X, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X) { + return (__m128i)__builtin_ia32_pmovzxdq128_mask((__v4si)__X, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) { + return (__m128i)__builtin_ia32_pmovzxdq128_mask( + (__v4si)__X, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_rcp14_pd(__m256d __A) { + return (__m256d)__builtin_ia32_rcp14pd256_mask( + (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_rcp14_pd(__m256d __W, __mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_rcp14pd256_mask((__v4df)__A, (__v4df)__W, + (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_rcp14_pd(__mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_rcp14pd256_mask( + (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rcp14_pd(__m128d __A) { + return (__m128d)__builtin_ia32_rcp14pd128_mask( + (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_rcp14_pd(__m128d __W, __mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_rcp14pd128_mask((__v2df)__A, (__v2df)__W, + (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_rcp14_pd(__mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_rcp14pd128_mask( + (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_rcp14_ps(__m256 __A) { + return (__m256)__builtin_ia32_rcp14ps256_mask( + (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_rcp14_ps(__m256 __W, __mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_rcp14ps256_mask((__v8sf)__A, (__v8sf)__W, + (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_rcp14_ps(__mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_rcp14ps256_mask( + (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rcp14_ps(__m128 __A) { + return (__m128)__builtin_ia32_rcp14ps128_mask( + (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_rcp14_ps(__m128 __W, __mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_rcp14ps128_mask((__v4sf)__A, (__v4sf)__W, + (__mmask8)__U); +} + +extern __inline __m128 + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_rcp14_ps(__mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_rcp14ps128_mask( + (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_rsqrt14_pd(__m256d __A) { + return (__m256d)__builtin_ia32_rsqrt14pd256_mask( + (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_rsqrt14_pd(__m256d __W, __mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_rsqrt14pd256_mask((__v4df)__A, (__v4df)__W, + (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_rsqrt14_pd(__mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_rsqrt14pd256_mask( + (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rsqrt14_pd(__m128d __A) { + return (__m128d)__builtin_ia32_rsqrt14pd128_mask( + (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_rsqrt14_pd(__m128d __W, __mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_rsqrt14pd128_mask((__v2df)__A, (__v2df)__W, + (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_rsqrt14_pd(__mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_rsqrt14pd128_mask( + (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_rsqrt14_ps(__m256 __A) { + return (__m256)__builtin_ia32_rsqrt14ps256_mask( + (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_rsqrt14_ps(__m256 __W, __mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_rsqrt14ps256_mask((__v8sf)__A, (__v8sf)__W, + (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_rsqrt14_ps(__mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_rsqrt14ps256_mask( + (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rsqrt14_ps(__m128 __A) { + return (__m128)__builtin_ia32_rsqrt14ps128_mask( + (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_rsqrt14_ps(__m128 __W, __mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_rsqrt14ps128_mask((__v4sf)__A, (__v4sf)__W, + (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_rsqrt14_ps(__mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_rsqrt14ps128_mask( + (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_sqrtpd256_mask((__v4df)__A, (__v4df)__W, + (__mmask8)__U); +} + +extern __inline __m256d + 
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_sqrtpd256_mask(
+      (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_sqrtpd128_mask((__v2df)__A, (__v2df)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_sqrtpd128_mask(
+      (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_sqrtps256_mask((__v8sf)__A, (__v8sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_sqrtps256_mask(
+      (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_sqrtps128_mask((__v4sf)__A, (__v4sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_sqrtps128_mask(
+      (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_paddd256_mask((__v8si)__A, (__v8si)__B,
+      (__v8si)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_paddd256_mask(
+      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_paddq256_mask((__v4di)__A, (__v4di)__B,
+      (__v4di)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_paddq256_mask(
+      (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_psubd256_mask((__v8si)__A, (__v8si)__B,
+      (__v8si)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_psubd256_mask(
+      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_psubq256_mask((__v4di)__A, (__v4di)__B,
+      (__v4di)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_psubq256_mask(
+      (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_paddd128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_paddd128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_paddq128_mask((__v2di)__A, (__v2di)__B,
+      (__v2di)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_paddq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_psubd128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_psubd128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_psubq128_mask((__v2di)__A, (__v2di)__B,
+      (__v2di)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_psubq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_getexp_ps(__m256 __A) {
+  return (__m256)__builtin_ia32_getexpps256_mask(
+      (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_getexp_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_getexpps256_mask((__v8sf)__A, (__v8sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_getexp_ps(__mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_getexpps256_mask(
+      (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
+}
+
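+/*
+ * The getexp family extracts each element's binary exponent as a float,
+ * i.e. roughly floor(log2(|x|)). A small sketch (illustrative only):
+ *
+ *   __m128 e = _mm_getexp_ps(_mm_set1_ps(8.5f));  // each lane: 3.0f
+ */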
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_getexp_pd(__m256d __A) {
+  return (__m256d)__builtin_ia32_getexppd256_mask(
+      (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_getexp_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_getexppd256_mask((__v4df)__A, (__v4df)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_getexp_pd(__mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_getexppd256_mask(
+      (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_getexp_ps(__m128 __A) {
+  return (__m128)__builtin_ia32_getexpps128_mask(
+      (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_getexp_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_getexpps128_mask((__v4sf)__A, (__v4sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_getexp_ps(__mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_getexpps128_mask(
+      (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_getexp_pd(__m128d __A) {
+  return (__m128d)__builtin_ia32_getexppd128_mask(
+      (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_getexp_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_getexppd128_mask((__v2df)__A, (__v2df)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_getexp_pd(__mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_getexppd128_mask(
+      (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  return (__m256i)__builtin_ia32_psrld256_mask((__v8si)__A, (__v4si)__B,
+      (__v8si)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) {
+  return (__m256i)__builtin_ia32_psrld256_mask(
+      (__v8si)__A, (__v4si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_psrld128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_psrld128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  return (__m256i)__builtin_ia32_psrlq256_mask((__v4di)__A, (__v2di)__B,
+      (__v4di)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) {
+  return (__m256i)__builtin_ia32_psrlq256_mask(
+      (__v4di)__A, (__v2di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_psrlq128_mask((__v2di)__A, (__v2di)__B,
+      (__v2di)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_psrlq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_pandd256_mask((__v8si)__A, (__v8si)__B,
+      (__v8si)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_pandd256_mask(
+      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_scalef_pd(__m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_scalefpd256_mask(
+      (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)-1);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_scalef_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_scalefpd256_mask((__v4df)__A, (__v4df)__B,
+      (__v4df)__W, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_scalef_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_scalefpd256_mask(
+      (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_scalef_ps(__m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_scalefps256_mask(
+      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_scalef_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_scalefps256_mask((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__W, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_scalef_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_scalefps256_mask(
+      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_scalef_pd(__m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_scalefpd128_mask(
+      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)-1);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_scalef_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_scalefpd128_mask((__v2df)__A, (__v2df)__B,
+      (__v2df)__W, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_scalef_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_scalefpd128_mask(
+      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_scalef_ps(__m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_scalefps128_mask(
+      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_scalef_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_scalefps128_mask((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__W, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_scalef_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_scalefps128_mask(
+      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
+  return (__m256d)__builtin_ia32_vfmaddpd256_mask((__v4df)__A, (__v4df)__B,
+      (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
+  return (__m256d)__builtin_ia32_vfmaddpd256_mask3((__v4df)__A, (__v4df)__B,
+      (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
+  return (__m256d)__builtin_ia32_vfmaddpd256_maskz((__v4df)__A, (__v4df)__B,
+      (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
+  return (__m128d)__builtin_ia32_vfmaddpd128_mask((__v2df)__A, (__v2df)__B,
+      (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
+  return (__m128d)__builtin_ia32_vfmaddpd128_mask3((__v2df)__A, (__v2df)__B,
+      (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
+  return (__m128d)__builtin_ia32_vfmaddpd128_maskz((__v2df)__A, (__v2df)__B,
+      (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
+  return (__m256)__builtin_ia32_vfmaddps256_mask((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
+  return (__m256)__builtin_ia32_vfmaddps256_mask3((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
+  return (__m256)__builtin_ia32_vfmaddps256_maskz((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
+  return (__m128)__builtin_ia32_vfmaddps128_mask((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
+  return (__m128)__builtin_ia32_vfmaddps128_mask3((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
+  return (__m128)__builtin_ia32_vfmaddps128_maskz((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
+  return (__m256d)__builtin_ia32_vfmsubpd256_mask((__v4df)__A, (__v4df)__B,
+      (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
+  return (__m256d)__builtin_ia32_vfmsubpd256_mask3((__v4df)__A, (__v4df)__B,
+      (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
+  return (__m256d)__builtin_ia32_vfmsubpd256_maskz((__v4df)__A, (__v4df)__B,
+      (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
+  return (__m128d)__builtin_ia32_vfmsubpd128_mask((__v2df)__A, (__v2df)__B,
+      (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
+  return (__m128d)__builtin_ia32_vfmsubpd128_mask3((__v2df)__A, (__v2df)__B,
+      (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
+  return (__m128d)__builtin_ia32_vfmsubpd128_maskz((__v2df)__A, (__v2df)__B,
+      (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
+  return (__m256)__builtin_ia32_vfmsubps256_mask((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
+  return (__m256)__builtin_ia32_vfmsubps256_mask3((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
+  return (__m256)__builtin_ia32_vfmsubps256_maskz((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
+  return (__m128)__builtin_ia32_vfmsubps128_mask((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
+  return (__m128)__builtin_ia32_vfmsubps128_mask3((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
+  return (__m128)__builtin_ia32_vfmsubps128_maskz((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B,
+      __m256d __C) {
+  return (__m256d)__builtin_ia32_vfmaddsubpd256_mask(
+      (__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C,
+      __mmask8 __U) {
+  return (__m256d)__builtin_ia32_vfmaddsubpd256_mask3(
+      (__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B,
+      __m256d __C) {
+  return (__m256d)__builtin_ia32_vfmaddsubpd256_maskz(
+      (__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
+  return (__m128d)__builtin_ia32_vfmaddsubpd128_mask(
+      (__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
+  return (__m128d)__builtin_ia32_vfmaddsubpd128_mask3(
+      (__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
+  return (__m128d)__builtin_ia32_vfmaddsubpd128_maskz(
+      (__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
+  return (__m256)__builtin_ia32_vfmaddsubps256_mask((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
+  return (__m256)__builtin_ia32_vfmaddsubps256_mask3(
+      (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
+  return (__m256)__builtin_ia32_vfmaddsubps256_maskz(
+      (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
+  return (__m128)__builtin_ia32_vfmaddsubps128_mask((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
+  return (__m128)__builtin_ia32_vfmaddsubps128_mask3(
+      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
+  return (__m128)__builtin_ia32_vfmaddsubps128_maskz(
+      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B,
+      __m256d __C) {
+  return (__m256d)__builtin_ia32_vfmaddsubpd256_mask(
+      (__v4df)__A, (__v4df)__B, -(__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C,
+      __mmask8 __U) {
+  return (__m256d)__builtin_ia32_vfmsubaddpd256_mask3(
+      (__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B,
+      __m256d __C) {
+  return (__m256d)__builtin_ia32_vfmaddsubpd256_maskz(
+      (__v4df)__A, (__v4df)__B, -(__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
+  return (__m128d)__builtin_ia32_vfmaddsubpd128_mask(
+      (__v2df)__A, (__v2df)__B, -(__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
+  return (__m128d)__builtin_ia32_vfmsubaddpd128_mask3(
+      (__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
+  return (__m128d)__builtin_ia32_vfmaddsubpd128_maskz(
+      (__v2df)__A, (__v2df)__B, -(__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
+  return (__m256)__builtin_ia32_vfmaddsubps256_mask(
+      (__v8sf)__A, (__v8sf)__B, -(__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
+  return (__m256)__builtin_ia32_vfmsubaddps256_mask3(
+      (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
+  return (__m256)__builtin_ia32_vfmaddsubps256_maskz(
+      (__v8sf)__A, (__v8sf)__B, -(__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
+  return (__m128)__builtin_ia32_vfmaddsubps128_mask(
+      (__v4sf)__A, (__v4sf)__B, -(__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
+  return (__m128)__builtin_ia32_vfmsubaddps128_mask3(
+      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
+  return (__m128)__builtin_ia32_vfmaddsubps128_maskz(
+      (__v4sf)__A, (__v4sf)__B, -(__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
+  return (__m256d)__builtin_ia32_vfnmaddpd256_mask((__v4df)__A, (__v4df)__B,
+      (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
+  return (__m256d)__builtin_ia32_vfnmaddpd256_mask3((__v4df)__A, (__v4df)__B,
+      (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
+  return (__m256d)__builtin_ia32_vfnmaddpd256_maskz((__v4df)__A, (__v4df)__B,
+      (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
+  return (__m128d)__builtin_ia32_vfnmaddpd128_mask((__v2df)__A, (__v2df)__B,
+      (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
+  return (__m128d)__builtin_ia32_vfnmaddpd128_mask3((__v2df)__A, (__v2df)__B,
+      (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
+  return (__m128d)__builtin_ia32_vfnmaddpd128_maskz((__v2df)__A, (__v2df)__B,
+      (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
+  return (__m256)__builtin_ia32_vfnmaddps256_mask((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
+  return (__m256)__builtin_ia32_vfnmaddps256_mask3((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
+  return (__m256)__builtin_ia32_vfnmaddps256_maskz((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
+  return (__m128)__builtin_ia32_vfnmaddps128_mask((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
+  return (__m128)__builtin_ia32_vfnmaddps128_mask3((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
+  return (__m128)__builtin_ia32_vfnmaddps128_maskz((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) {
+  return (__m256d)__builtin_ia32_vfnmsubpd256_mask((__v4df)__A, (__v4df)__B,
+      (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) {
+  return (__m256d)__builtin_ia32_vfnmsubpd256_mask3((__v4df)__A, (__v4df)__B,
+      (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) {
+  return (__m256d)__builtin_ia32_vfnmsubpd256_maskz((__v4df)__A, (__v4df)__B,
+      (__v4df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
+  return (__m128d)__builtin_ia32_vfnmsubpd128_mask((__v2df)__A, (__v2df)__B,
+      (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) {
+  return (__m128d)__builtin_ia32_vfnmsubpd128_mask3((__v2df)__A, (__v2df)__B,
+      (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) {
+  return (__m128d)__builtin_ia32_vfnmsubpd128_maskz((__v2df)__A, (__v2df)__B,
+      (__v2df)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) {
+  return (__m256)__builtin_ia32_vfnmsubps256_mask((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) {
+  return (__m256)__builtin_ia32_vfnmsubps256_mask3((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) {
+  return (__m256)__builtin_ia32_vfnmsubps256_maskz((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) {
+  return (__m128)__builtin_ia32_vfnmsubps128_mask((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__C, (__mmask8)__U);
+}
+
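+/*
+ * For the fused multiply-add family above and below, the masked variants
+ * differ in which operand supplies the inactive lanes: _mask_ keeps __A,
+ * _mask3_ keeps __C, and _maskz_ zeroes them; the __mmask8 selects lanes.
+ */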
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) {
+  return (__m128)__builtin_ia32_vfnmsubps128_mask3((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) {
+  return (__m128)__builtin_ia32_vfnmsubps128_maskz((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__C, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pandd128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_and_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pandd128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B) {
+  return (__m256i)__builtin_ia32_pandnd256_mask((__v8si)__A, (__v8si)__B,
+      (__v8si)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_andnot_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_pandnd256_mask(
+      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pandnd128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_andnot_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pandnd128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_or_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_pord256_mask((__v8si)__A, (__v8si)__B,
+      (__v8si)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_or_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_pord256_mask(
+      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_or_epi32(__m256i __A, __m256i __B) {
+  return (__m256i)((__v8su)__A | (__v8su)__B);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pord128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_or_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pord128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_or_epi32(__m128i __A, __m128i __B) {
+  return (__m128i)((__v4su)__A | (__v4su)__B);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_pxord256_mask((__v8si)__A, (__v8si)__B,
+      (__v8si)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_xor_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_pxord256_mask(
+      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_xor_epi32(__m256i __A, __m256i __B) {
+  return (__m256i)((__v8su)__A ^ (__v8su)__B);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pxord128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_xor_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pxord128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_xor_epi32(__m128i __A, __m128i __B) {
+  return (__m128i)((__v4su)__A ^ (__v4su)__B);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cvtpd_ps(__m128 __W, __mmask8 __U, __m128d __A) {
+  return (__m128)__builtin_ia32_cvtpd2ps_mask((__v2df)__A, (__v4sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_cvtpd_ps(__mmask8 __U, __m128d __A) {
+  return (__m128)__builtin_ia32_cvtpd2ps_mask(
+      (__v2df)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cvtpd_ps(__m128 __W, __mmask8 __U, __m256d __A) {
+  return (__m128)__builtin_ia32_cvtpd2ps256_mask((__v4df)__A, (__v4sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_cvtpd_ps(__mmask8 __U, __m256d __A) {
+  return (__m128)__builtin_ia32_cvtpd2ps256_mask(
+      (__v4df)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cvtps_epi32(__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i)__builtin_ia32_cvtps2dq256_mask((__v8sf)__A, (__v8si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_cvtps_epi32(__mmask8 __U, __m256 __A) {
+  return (__m256i)__builtin_ia32_cvtps2dq256_mask(
+      (__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cvtps_epi32(__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i)__builtin_ia32_cvtps2dq128_mask((__v4sf)__A, (__v4si)__W,
+      (__mmask8)__U);
+}
+
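+/*
+ * cvtps_epi32 converts to signed and cvtps_epu32 (below) to unsigned
+ * 32-bit lanes, both honoring the current MXCSR rounding mode. A sketch,
+ * assuming the default round-to-nearest-even mode:
+ *
+ *   __m128i u = _mm_cvtps_epu32(_mm_set1_ps(2.5f));  // each lane: 2
+ */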
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_cvtps_epi32(__mmask8 __U, __m128 __A) {
+  return (__m128i)__builtin_ia32_cvtps2dq128_mask(
+      (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cvtps_epu32(__m256 __A) {
+  return (__m256i)__builtin_ia32_cvtps2udq256_mask(
+      (__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)-1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cvtps_epu32(__m256i __W, __mmask8 __U, __m256 __A) {
+  return (__m256i)__builtin_ia32_cvtps2udq256_mask((__v8sf)__A, (__v8si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_cvtps_epu32(__mmask8 __U, __m256 __A) {
+  return (__m256i)__builtin_ia32_cvtps2udq256_mask(
+      (__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtps_epu32(__m128 __A) {
+  return (__m128i)__builtin_ia32_cvtps2udq128_mask(
+      (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cvtps_epu32(__m128i __W, __mmask8 __U, __m128 __A) {
+  return (__m128i)__builtin_ia32_cvtps2udq128_mask((__v4sf)__A, (__v4si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_cvtps_epu32(__mmask8 __U, __m128 __A) {
+  return (__m128i)__builtin_ia32_cvtps2udq128_mask(
+      (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_movedup_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_movddup256_mask((__v4df)__A, (__v4df)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_movedup_pd(__mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_movddup256_mask(
+      (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_movedup_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_movddup128_mask((__v2df)__A, (__v2df)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_movedup_pd(__mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_movddup128_mask(
+      (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_movehdup_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_movshdup256_mask((__v8sf)__A, (__v8sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_movehdup_ps(__mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_movshdup256_mask(
+      (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_movehdup_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_movshdup128_mask((__v4sf)__A, (__v4sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_movehdup_ps(__mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_movshdup128_mask(
+      (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_moveldup_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_movsldup256_mask((__v8sf)__A, (__v8sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_moveldup_ps(__mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_movsldup256_mask(
+      (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_moveldup_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_movsldup128_mask((__v4sf)__A, (__v4sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_moveldup_ps(__mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_movsldup128_mask(
+      (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A,
+      __m128i __B) {
+  return (__m128i)__builtin_ia32_punpckhdq128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_punpckhdq128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B) {
+  return (__m256i)__builtin_ia32_punpckhdq256_mask((__v8si)__A, (__v8si)__B,
+      (__v8si)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_punpckhdq256_mask(
+      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A,
+      __m128i __B) {
+  return (__m128i)__builtin_ia32_punpckhqdq128_mask((__v2di)__A, (__v2di)__B,
+      (__v2di)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_punpckhqdq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B) {
+  return (__m256i)__builtin_ia32_punpckhqdq256_mask((__v4di)__A, (__v4di)__B,
+      (__v4di)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_punpckhqdq256_mask(
+      (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A,
+      __m128i __B) {
+  return (__m128i)__builtin_ia32_punpckldq128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_punpckldq128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B) {
+  return (__m256i)__builtin_ia32_punpckldq256_mask((__v8si)__A, (__v8si)__B,
+      (__v8si)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_punpckldq256_mask(
+      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A,
+      __m128i __B) {
+  return (__m128i)__builtin_ia32_punpcklqdq128_mask((__v2di)__A, (__v2di)__B,
+      (__v2di)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_punpcklqdq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A,
+      __m256i __B) {
+  return (__m256i)__builtin_ia32_punpcklqdq256_mask((__v4di)__A, (__v4di)__B,
+      (__v4di)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_punpcklqdq256_mask(
+      (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpeq_epu32_mask(__m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__A, (__v4si)__B, 0,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpeq_epi32_mask(__m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__A, (__v4si)__B,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpeq_epu32_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__A, (__v4si)__B, 0,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpeq_epi32_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__A, (__v4si)__B,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpeq_epu32_mask(__m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__A, (__v8si)__B, 0,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpeq_epi32_mask(__m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__A, (__v8si)__B,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpeq_epu32_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__A, (__v8si)__B, 0,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpeq_epi32_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__A, (__v8si)__B,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpeq_epu64_mask(__m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__A, (__v2di)__B, 0,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpeq_epi64_mask(__m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__A, (__v2di)__B,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpeq_epu64_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__A, (__v2di)__B, 0,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpeq_epi64_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__A, (__v2di)__B,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpeq_epu64_mask(__m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__A, (__v4di)__B, 0,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpeq_epi64_mask(__m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__A, (__v4di)__B,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpeq_epu64_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__A, (__v4di)__B, 0,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpeq_epi64_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__A, (__v4di)__B,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpgt_epu32_mask(__m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__A, (__v4si)__B, 6,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpgt_epi32_mask(__m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__A, (__v4si)__B,
+      (__mmask8)-1);
+}
+
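+/*
+ * The compare intrinsics return an __mmask8 bitmap rather than a vector;
+ * in the ucmp builtins the immediate selects the predicate (0 is EQ and 6
+ * is unsigned greater-than, i.e. _MM_CMPINT_EQ and _MM_CMPINT_NLE).
+ */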
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpgt_epu32_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__A, (__v4si)__B, 6,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpgt_epi32_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__A, (__v4si)__B,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpgt_epu32_mask(__m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__A, (__v8si)__B, 6,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpgt_epi32_mask(__m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__A, (__v8si)__B,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpgt_epu32_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__A, (__v8si)__B, 6,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpgt_epi32_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__A, (__v8si)__B,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpgt_epu64_mask(__m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__A, (__v2di)__B, 6,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpgt_epi64_mask(__m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__A, (__v2di)__B,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpgt_epu64_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__A, (__v2di)__B, 6,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpgt_epi64_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__A, (__v2di)__B,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpgt_epu64_mask(__m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__A, (__v4di)__B, 6,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpgt_epi64_mask(__m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__A, (__v4di)__B,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpgt_epu64_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__A, (__v4di)__B, 6,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpgt_epi64_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__A, (__v4di)__B,
+      __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_test_epi32_mask(__m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ptestmd128((__v4si)__A, (__v4si)__B,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_test_epi32_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ptestmd128((__v4si)__A, (__v4si)__B, __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_test_epi32_mask(__m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ptestmd256((__v8si)__A, (__v8si)__B,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_test_epi32_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ptestmd256((__v8si)__A, (__v8si)__B, __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_test_epi64_mask(__m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ptestmq128((__v2di)__A, (__v2di)__B,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_test_epi64_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ptestmq128((__v2di)__A, (__v2di)__B, __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_test_epi64_mask(__m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ptestmq256((__v4di)__A, (__v4di)__B,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_test_epi64_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ptestmq256((__v4di)__A, (__v4di)__B, __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_testn_epi32_mask(__m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ptestnmd128((__v4si)__A, (__v4si)__B,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_testn_epi32_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ptestnmd128((__v4si)__A, (__v4si)__B, __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_testn_epi32_mask(__m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ptestnmd256((__v8si)__A, (__v8si)__B,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_testn_epi32_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ptestnmd256((__v8si)__A, (__v8si)__B, __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_testn_epi64_mask(__m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ptestnmq128((__v2di)__A, (__v2di)__B,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_testn_epi64_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__mmask8)__builtin_ia32_ptestnmq128((__v2di)__A, (__v2di)__B, __U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_testn_epi64_mask(__m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ptestnmq256((__v4di)__A, (__v4di)__B,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_testn_epi64_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  return (__mmask8)__builtin_ia32_ptestnmq256((__v4di)__A, (__v4di)__B, __U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_compress_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_compressdf256_mask((__v4df)__A, (__v4df)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_compress_pd(__mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_compressdf256_mask(
+      (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m256d __A) {
+  __builtin_ia32_compressstoredf256_mask((__v4df *)__P, (__v4df)__A,
+      (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_compress_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_compressdf128_mask((__v2df)__A, (__v2df)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_compress_pd(__mmask8 __U, __m128d __A) {
+  return (__m128d)__builtin_ia32_compressdf128_mask(
+      (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m128d __A) {
+  __builtin_ia32_compressstoredf128_mask((__v2df *)__P, (__v2df)__A,
+      (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_compress_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_compresssf256_mask((__v8sf)__A, (__v8sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_compress_ps(__mmask8 __U, __m256 __A) {
+  return (__m256)__builtin_ia32_compresssf256_mask(
+      (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_compressstoreu_ps(void *__P, __mmask8 __U, __m256 __A) {
+  __builtin_ia32_compressstoresf256_mask((__v8sf *)__P, (__v8sf)__A,
+      (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_compress_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_compresssf128_mask((__v4sf)__A, (__v4sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_compress_ps(__mmask8 __U, __m128 __A) {
+  return (__m128)__builtin_ia32_compresssf128_mask(
+      (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_compressstoreu_ps(void *__P, __mmask8 __U, __m128 __A) {
+  __builtin_ia32_compressstoresf128_mask((__v4sf *)__P, (__v4sf)__A,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_compress_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_compressdi256_mask((__v4di)__A, (__v4di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_compress_epi64(__mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_compressdi256_mask(
+      (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_compressstoreu_epi64(void *__P, __mmask8 __U, __m256i __A) {
+  __builtin_ia32_compressstoredi256_mask((__v4di *)__P, (__v4di)__A,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_compress_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_compressdi128_mask((__v2di)__A, (__v2di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_compress_epi64(__mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_compressdi128_mask(
+      (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_compressstoreu_epi64(void *__P, __mmask8 __U, __m128i __A) {
+  __builtin_ia32_compressstoredi128_mask((__v2di *)__P, (__v2di)__A,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_compress_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_compresssi256_mask((__v8si)__A, (__v8si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_compress_epi32(__mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_compresssi256_mask(
+      (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_compressstoreu_epi32(void *__P, __mmask8 __U, __m256i __A) {
+  __builtin_ia32_compressstoresi256_mask((__v8si *)__P, (__v8si)__A,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_compress_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_compresssi128_mask((__v4si)__A, (__v4si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_compress_epi32(__mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_compresssi128_mask(
+      (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_compressstoreu_epi32(void *__P, __mmask8 __U, __m128i __A) {
+  __builtin_ia32_compressstoresi128_mask((__v4si *)__P, (__v4si)__A,
+      (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_expand_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_expanddf256_mask((__v4df)__A, (__v4df)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_expand_pd(__mmask8 __U, __m256d __A) {
+  return (__m256d)__builtin_ia32_expanddf256_maskz(
+      (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m256d
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_expandloadu_pd(__m256d __W, __mmask8 __U, void const *__P) { + return (__m256d)__builtin_ia32_expandloaddf256_mask( + (__v4df *)__P, (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_expandloadu_pd(__mmask8 __U, void const *__P) { + return (__m256d)__builtin_ia32_expandloaddf256_maskz( + (__v4df *)__P, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_expand_pd(__m128d __W, __mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_expanddf128_mask((__v2df)__A, (__v2df)__W, + (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_expand_pd(__mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_expanddf128_maskz( + (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_expandloadu_pd(__m128d __W, __mmask8 __U, void const *__P) { + return (__m128d)__builtin_ia32_expandloaddf128_mask( + (__v2df *)__P, (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_expandloadu_pd(__mmask8 __U, void const *__P) { + return (__m128d)__builtin_ia32_expandloaddf128_maskz( + (__v2df *)__P, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_expand_ps(__m256 __W, __mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_expandsf256_mask((__v8sf)__A, (__v8sf)__W, + (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_expand_ps(__mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_expandsf256_maskz( + (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_expandloadu_ps(__m256 __W, __mmask8 __U, void const *__P) { + return (__m256)__builtin_ia32_expandloadsf256_mask((__v8sf *)__P, (__v8sf)__W, + (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_expandloadu_ps(__mmask8 __U, void const *__P) { + return (__m256)__builtin_ia32_expandloadsf256_maskz( + (__v8sf *)__P, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_expand_ps(__m128 __W, __mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_expandsf128_mask((__v4sf)__A, (__v4sf)__W, + (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_expand_ps(__mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_expandsf128_maskz( + (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_expandloadu_ps(__m128 __W, __mmask8 __U, void const *__P) { + return (__m128)__builtin_ia32_expandloadsf128_mask((__v4sf *)__P, (__v4sf)__W, + (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm_maskz_expandloadu_ps(__mmask8 __U, void const *__P) { + return (__m128)__builtin_ia32_expandloadsf128_maskz( + (__v4sf *)__P, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_expand_epi64(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_expanddi256_mask((__v4di)__A, (__v4di)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_expand_epi64(__mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_expanddi256_maskz( + (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_expandloadu_epi64(__m256i __W, __mmask8 __U, void const *__P) { + return (__m256i)__builtin_ia32_expandloaddi256_mask( + (__v4di *)__P, (__v4di)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) { + return (__m256i)__builtin_ia32_expandloaddi256_maskz( + (__v4di *)__P, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_expand_epi64(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_expanddi128_mask((__v2di)__A, (__v2di)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_expand_epi64(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_expanddi128_maskz( + (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_expandloadu_epi64(__m128i __W, __mmask8 __U, void const *__P) { + return (__m128i)__builtin_ia32_expandloaddi128_mask( + (__v2di *)__P, (__v2di)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) { + return (__m128i)__builtin_ia32_expandloaddi128_maskz( + (__v2di *)__P, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_expand_epi32(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_expandsi256_mask((__v8si)__A, (__v8si)__W, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_expand_epi32(__mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_expandsi256_maskz( + (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_expandloadu_epi32(__m256i __W, __mmask8 __U, void const *__P) { + return (__m256i)__builtin_ia32_expandloadsi256_mask( + (__v8si *)__P, (__v8si)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_expandloadu_epi32(__mmask8 __U, void const *__P) { + return (__m256i)__builtin_ia32_expandloadsi256_maskz( + (__v8si *)__P, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm_mask_expand_epi32(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_expandsi128_mask((__v4si)__A, (__v4si)__W, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_expand_epi32(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_expandsi128_maskz( + (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_expandloadu_epi32(__m128i __W, __mmask8 __U, void const *__P) { + return (__m128i)__builtin_ia32_expandloadsi128_mask( + (__v4si *)__P, (__v4si)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_expandloadu_epi32(__mmask8 __U, void const *__P) { + return (__m128i)__builtin_ia32_expandloadsi128_maskz( + (__v4si *)__P, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { + return (__m256d)__builtin_ia32_vpermt2varpd256_mask((__v4di)__I + /* idx */, + (__v4df)__A, (__v4df)__B, + (__mmask8)-1); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, + __m256d __B) { + return (__m256d)__builtin_ia32_vpermt2varpd256_mask((__v4di)__I + /* idx */, + (__v4df)__A, (__v4df)__B, + (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U, + __m256d __B) { + return (__m256d)__builtin_ia32_vpermi2varpd256_mask((__v4df)__A, + (__v4di)__I + /* idx */, + (__v4df)__B, + (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, + __m256d __B) { + return (__m256d)__builtin_ia32_vpermt2varpd256_maskz((__v4di)__I + /* idx */, + (__v4df)__A, (__v4df)__B, + (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { + return (__m256)__builtin_ia32_vpermt2varps256_mask((__v8si)__I + /* idx */, + (__v8sf)__A, (__v8sf)__B, + (__mmask8)-1); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) { + return (__m256)__builtin_ia32_vpermt2varps256_mask((__v8si)__I + /* idx */, + (__v8sf)__A, (__v8sf)__B, + (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U, + __m256 __B) { + return (__m256)__builtin_ia32_vpermi2varps256_mask((__v8sf)__A, + (__v8si)__I + /* idx */, + (__v8sf)__B, + (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, + __m256 __B) { + return (__m256)__builtin_ia32_vpermt2varps256_maskz((__v8si)__I + /* idx */, + (__v8sf)__A, (__v8sf)__B, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_permutex2var_epi64(__m128i __A, 
__m128i __I, __m128i __B) {
+  return (__m128i)__builtin_ia32_vpermt2varq128_mask((__v2di)__I
+                                                     /* idx */,
+                                                     (__v2di)__A, (__v2di)__B,
+                                                     (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I,
+                                __m128i __B) {
+  return (__m128i)__builtin_ia32_vpermt2varq128_mask((__v2di)__I
+                                                     /* idx */,
+                                                     (__v2di)__A, (__v2di)__B,
+                                                     (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U,
+                                 __m128i __B) {
+  return (__m128i)__builtin_ia32_vpermi2varq128_mask((__v2di)__A,
+                                                     (__v2di)__I
+                                                     /* idx */,
+                                                     (__v2di)__B,
+                                                     (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I,
+                                 __m128i __B) {
+  return (__m128i)__builtin_ia32_vpermt2varq128_maskz((__v2di)__I
+                                                      /* idx */,
+                                                      (__v2di)__A, (__v2di)__B,
+                                                      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) {
+  return (__m128i)__builtin_ia32_vpermt2vard128_mask((__v4si)__I
+                                                     /* idx */,
+                                                     (__v4si)__A, (__v4si)__B,
+                                                     (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I,
+                                __m128i __B) {
+  return (__m128i)__builtin_ia32_vpermt2vard128_mask((__v4si)__I
+                                                     /* idx */,
+                                                     (__v4si)__A, (__v4si)__B,
+                                                     (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U,
+                                 __m128i __B) {
+  return (__m128i)__builtin_ia32_vpermi2vard128_mask((__v4si)__A,
+                                                     (__v4si)__I
+                                                     /* idx */,
+                                                     (__v4si)__B,
+                                                     (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I,
+                                 __m128i __B) {
+  return (__m128i)__builtin_ia32_vpermt2vard128_maskz((__v4si)__I
+                                                      /* idx */,
+                                                      (__v4si)__A, (__v4si)__B,
+                                                      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) {
+  return (__m256i)__builtin_ia32_vpermt2varq256_mask((__v4di)__I
+                                                     /* idx */,
+                                                     (__v4di)__A, (__v4di)__B,
+                                                     (__mmask8)-1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I,
+                                   __m256i __B) {
+  return (__m256i)__builtin_ia32_vpermt2varq256_mask((__v4di)__I
+                                                     /* idx */,
+                                                     (__v4di)__A, (__v4di)__B,
+                                                     (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U,
+                                    __m256i __B) {
+  return (__m256i)__builtin_ia32_vpermi2varq256_mask((__v4di)__A,
+                                                     (__v4di)__I
+                                                     /* idx */,
+                                                     (__v4di)__B,
+                                                     (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I,
+                                    __m256i __B) {
+  return (__m256i)__builtin_ia32_vpermt2varq256_maskz((__v4di)__I
+                                                      /* idx */,
+                                                      (__v4di)__A, (__v4di)__B,
+                                                      (__mmask8)__U);
+}
+
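(Aside: the vpermt2var/vpermi2var wrappers being defined here, continued below for the 256-bit epi32 and the pd/ps forms, all follow one pattern: each lane of the index vector __I selects an element from the concatenation of __A and __B, with the low index bits choosing the element and the next bit choosing the source; the mask_ variant copies lanes from __A where the mask bit is clear, the mask2_ variant copies them from __I, and the maskz_ variant zeroes them. A minimal usage sketch follows; it is illustrative only, not part of the patch, and assumes a toolchain and CPU with AVX-512F/AVX-512VL plus the standard <immintrin.h> front end rather than this tree's third_party/intel headers:

  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    /* _mm256_set_epi64x() takes lanes from high to low. */
    __m256i lo = _mm256_set_epi64x(3, 2, 1, 0);  /* reached by indices 0..3 */
    __m256i hi = _mm256_set_epi64x(7, 6, 5, 4);  /* reached by indices 4..7 */
    __m256i idx = _mm256_set_epi64x(1, 6, 0, 7); /* illustrative selectors */
    __m256i r = _mm256_permutex2var_epi64(lo, idx, hi);
    long long v[4];
    _mm256_storeu_si256((__m256i *)v, r);
    printf("%lld %lld %lld %lld\n", v[0], v[1], v[2], v[3]);
    return 0;
  }

Built with something like "gcc -O2 -mavx512f -mavx512vl", this should print 7 0 6 1 on AVX-512VL hardware, since index 7 takes the top lane of hi, index 0 the bottom lane of lo, and so on.)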
+extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) { + return (__m256i)__builtin_ia32_vpermt2vard256_mask((__v8si)__I + /* idx */, + (__v8si)__A, (__v8si)__B, + (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, + __m256i __B) { + return (__m256i)__builtin_ia32_vpermt2vard256_mask((__v8si)__I + /* idx */, + (__v8si)__A, (__v8si)__B, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, + __m256i __B) { + return (__m256i)__builtin_ia32_vpermi2vard256_mask((__v8si)__A, + (__v8si)__I + /* idx */, + (__v8si)__B, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, + __m256i __B) { + return (__m256i)__builtin_ia32_vpermt2vard256_maskz((__v8si)__I + /* idx */, + (__v8si)__A, (__v8si)__B, + (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) { + return (__m128d)__builtin_ia32_vpermt2varpd128_mask((__v2di)__I + /* idx */, + (__v2df)__A, (__v2df)__B, + (__mmask8)-1); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) { + return (__m128d)__builtin_ia32_vpermt2varpd128_mask((__v2di)__I + /* idx */, + (__v2df)__A, (__v2df)__B, + (__mmask8)__U); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) { + return (__m128d)__builtin_ia32_vpermi2varpd128_mask((__v2df)__A, + (__v2di)__I + /* idx */, + (__v2df)__B, + (__mmask8)__U); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) { + return (__m128d)__builtin_ia32_vpermt2varpd128_maskz((__v2di)__I + /* idx */, + (__v2df)__A, (__v2df)__B, + (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) { + return (__m128)__builtin_ia32_vpermt2varps128_mask((__v4si)__I + /* idx */, + (__v4sf)__A, (__v4sf)__B, + (__mmask8)-1); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) { + return (__m128)__builtin_ia32_vpermt2varps128_mask((__v4si)__I + /* idx */, + (__v4sf)__A, (__v4sf)__B, + (__mmask8)__U); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) { + return (__m128)__builtin_ia32_vpermi2varps128_mask((__v4sf)__A, + (__v4si)__I + /* idx */, + (__v4sf)__B, + (__mmask8)__U); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) { + return 
(__m128)__builtin_ia32_vpermt2varps128_maskz((__v4si)__I + /* idx */, + (__v4sf)__A, (__v4sf)__B, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srav_epi64(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psravq128_mask( + (__v2di)__X, (__v2di)__Y, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psravq128_mask((__v2di)__X, (__v2di)__Y, + (__v2di)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psravq128_mask( + (__v2di)__X, (__v2di)__Y, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psllv8si_mask((__v8si)__X, (__v8si)__Y, + (__v8si)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psllv8si_mask( + (__v8si)__X, (__v8si)__Y, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psllv4si_mask((__v4si)__X, (__v4si)__Y, + (__v4si)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psllv4si_mask( + (__v4si)__X, (__v4si)__Y, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psllv4di_mask((__v4di)__X, (__v4di)__Y, + (__v4di)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psllv4di_mask( + (__v4di)__X, (__v4di)__Y, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psllv2di_mask((__v2di)__X, (__v2di)__Y, + (__v2di)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psllv2di_mask( + (__v2di)__X, (__v2di)__Y, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psrav8si_mask((__v8si)__X, (__v8si)__Y, + (__v8si)__W, (__mmask8)__U); +} + +extern __inline __m256i + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psrav8si_mask( + (__v8si)__X, (__v8si)__Y, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psrav4si_mask((__v4si)__X, (__v4si)__Y, + (__v4si)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psrav4si_mask( + (__v4si)__X, (__v4si)__Y, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psrlv8si_mask((__v8si)__X, (__v8si)__Y, + (__v8si)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psrlv8si_mask( + (__v8si)__X, (__v8si)__Y, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psrlv4si_mask((__v4si)__X, (__v4si)__Y, + (__v4si)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psrlv4si_mask( + (__v4si)__X, (__v4si)__Y, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psrlv4di_mask((__v4di)__X, (__v4di)__Y, + (__v4di)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psrlv4di_mask( + (__v4di)__X, (__v4di)__Y, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psrlv2di_mask((__v2di)__X, (__v2di)__Y, + (__v2di)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psrlv2di_mask( + (__v2di)__X, (__v2di)__Y, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_rolv_epi32(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_prolvd256_mask( + (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_rolv_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + 
return (__m256i)__builtin_ia32_prolvd256_mask((__v8si)__A, (__v8si)__B, + (__v8si)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_rolv_epi32(__mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_prolvd256_mask( + (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rolv_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_prolvd128_mask( + (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_rolv_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_prolvd128_mask((__v4si)__A, (__v4si)__B, + (__v4si)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_rolv_epi32(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_prolvd128_mask( + (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_rorv_epi32(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_prorvd256_mask( + (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_rorv_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_prorvd256_mask((__v8si)__A, (__v8si)__B, + (__v8si)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_rorv_epi32(__mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_prorvd256_mask( + (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rorv_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_prorvd128_mask( + (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_rorv_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_prorvd128_mask((__v4si)__A, (__v4si)__B, + (__v4si)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_rorv_epi32(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_prorvd128_mask( + (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_rolv_epi64(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_prolvq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_rolv_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_prolvq256_mask((__v4di)__A, (__v4di)__B, + (__v4di)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm256_maskz_rolv_epi64(__mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_prolvq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rolv_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_prolvq128_mask( + (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_rolv_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_prolvq128_mask((__v2di)__A, (__v2di)__B, + (__v2di)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_rolv_epi64(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_prolvq128_mask( + (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_rorv_epi64(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_prorvq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_rorv_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_prorvq256_mask((__v4di)__A, (__v4di)__B, + (__v4di)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_rorv_epi64(__mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_prorvq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rorv_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_prorvq128_mask( + (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_rorv_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_prorvq128_mask((__v2di)__A, (__v2di)__B, + (__v2di)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_rorv_epi64(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_prorvq128_mask( + (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_srav_epi64(__m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psravq256_mask( + (__v4di)__X, (__v4di)__Y, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psravq256_mask((__v4di)__X, (__v4di)__Y, + (__v4di)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_srav_epi64(__mmask8 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_psravq256_mask( + (__v4di)__X, (__v4di)__Y, (__v4di)_mm256_setzero_si256(), 
(__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pandq256_mask((__v4di)__A, (__v4di)__B, + (__v4di)__W, __U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_and_epi64(__mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pandq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_pd(), __U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pandq128_mask((__v2di)__A, (__v2di)__B, + (__v2di)__W, __U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_and_epi64(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pandq128_mask((__v2di)__A, (__v2di)__B, + (__v2di)_mm_setzero_pd(), __U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pandnq256_mask((__v4di)__A, (__v4di)__B, + (__v4di)__W, __U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_andnot_epi64(__mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pandnq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_pd(), __U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pandnq128_mask((__v2di)__A, (__v2di)__B, + (__v2di)__W, __U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_andnot_epi64(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pandnq128_mask((__v2di)__A, (__v2di)__B, + (__v2di)_mm_setzero_pd(), __U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_porq256_mask((__v4di)__A, (__v4di)__B, + (__v4di)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_or_epi64(__mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_porq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_or_epi64(__m256i __A, __m256i __B) { + return (__m256i)((__v4du)__A | (__v4du)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_porq128_mask((__v2di)__A, (__v2di)__B, + (__v2di)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_or_epi64(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_porq128_mask( + (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline 
__m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_or_epi64(__m128i __A, __m128i __B) { + return (__m128i)((__v2du)__A | (__v2du)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pxorq256_mask((__v4di)__A, (__v4di)__B, + (__v4di)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_xor_epi64(__mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pxorq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_xor_epi64(__m256i __A, __m256i __B) { + return (__m256i)((__v4du)__A ^ (__v4du)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pxorq128_mask((__v2di)__A, (__v2di)__B, + (__v2di)__W, (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pxorq128_mask( + (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_xor_epi64(__m128i __A, __m128i __B) { + return (__m128i)((__v2du)__A ^ (__v2du)__B); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_maxpd256_mask((__v4df)__A, (__v4df)__B, + (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_maxpd256_mask( + (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_maxps256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__W, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_maxps256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_divps_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_divps_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return 
(__m128d)__builtin_ia32_divpd_mask((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_divpd_mask( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_minpd256_mask((__v4df)__A, (__v4df)__B, + (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_divpd256_mask((__v4df)__A, (__v4df)__B, + (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_minpd256_mask( + (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_minps256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_divpd256_mask( + (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_divps256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__W, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_minps256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_divps256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_minps_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_mulps_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_minps_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_mul_ps(__mmask8 __U, __m128 __A, 
__m128 __B) { + return (__m128)__builtin_ia32_mulps_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_maxps_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_maxps_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_minpd_mask((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_minpd_mask( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_maxpd_mask((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_maxpd_mask( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_mulpd_mask((__v2df)__A, (__v2df)__B, + (__v2df)__W, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_mulpd_mask( + (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_mulps256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__W, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_mulps256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_mulpd256_mask((__v4df)__A, (__v4df)__B, + (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_mulpd256_mask( + (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm256_maskz_max_epi64(__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxsq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_max_epi64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxsq256_mask((__v4di)__A, (__v4di)__B, + (__v4di)__W, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_min_epi64(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminsq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_min_epi64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminsq256_mask((__v4di)__A, (__v4di)__B, + (__v4di)__W, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_min_epi64(__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminsq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_max_epu64(__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxuq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_max_epi64(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxsq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_max_epu64(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxuq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_max_epu64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxuq256_mask((__v4di)__A, (__v4di)__B, + (__v4di)__W, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_min_epu64(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminuq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_min_epu64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminuq256_mask((__v4di)__A, (__v4di)__B, + (__v4di)__W, __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_min_epu64(__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pminuq256_mask( + (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxsd256_mask( + (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), __M); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, 
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_pmaxsd256_mask((__v8si)__A, (__v8si)__B,
+      (__v8si)__W, __M);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_pminsd256_mask(
+      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), __M);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_pminsd256_mask((__v8si)__A, (__v8si)__B,
+      (__v8si)__W, __M);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_pmaxud256_mask(
+      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), __M);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_pmaxud256_mask((__v8si)__A, (__v8si)__B,
+      (__v8si)__W, __M);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_pminud256_mask(
+      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), __M);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_pminud256_mask((__v8si)__A, (__v8si)__B,
+      (__v8si)__W, __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_max_epi64(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pmaxsq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_max_epi64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pmaxsq128_mask((__v2di)__A, (__v2di)__B,
+      (__v2di)__W, __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_min_epi64(__m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pminsq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_min_epi64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pminsq128_mask((__v2di)__A, (__v2di)__B,
+      (__v2di)__W, __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_min_epi64(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pminsq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_max_epu64(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pmaxuq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_max_epi64(__m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pmaxsq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_max_epu64(__m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pmaxuq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_max_epu64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pmaxuq128_mask((__v2di)__A, (__v2di)__B,
+      (__v2di)__W, __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_min_epu64(__m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pminuq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_min_epu64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pminuq128_mask((__v2di)__A, (__v2di)__B,
+      (__v2di)__W, __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_min_epu64(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pminuq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pmaxsd128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pmaxsd128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pminsd128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pminsd128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pmaxud128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pmaxud128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pminud128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pminud128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, __M);
+}
+
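+/*
+ * The broadcastm/lzcnt/conflict intrinsics that follow also require the
+ * AVX512CD extension, so the header temporarily compiles them with the
+ * avx512vl,avx512cd target whenever the translation unit itself was not
+ * built with AVX512CD.
+ */
+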
+#ifndef __AVX512CD__
+#pragma GCC push_options
+#pragma GCC target("avx512vl,avx512cd")
+#define __DISABLE_AVX512VLCD__
+#endif
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_broadcastmb_epi64(__mmask8 __A) {
+  return (__m128i)__builtin_ia32_broadcastmb128(__A);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_broadcastmb_epi64(__mmask8 __A) {
+  return (__m256i)__builtin_ia32_broadcastmb256(__A);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_broadcastmw_epi32(__mmask16 __A) {
+  return (__m128i)__builtin_ia32_broadcastmw128(__A);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_broadcastmw_epi32(__mmask16 __A) {
+  return (__m256i)__builtin_ia32_broadcastmw256(__A);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_lzcnt_epi32(__m256i __A) {
+  return (__m256i)__builtin_ia32_vplzcntd_256_mask(
+      (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)-1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_lzcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_vplzcntd_256_mask((__v8si)__A, (__v8si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_lzcnt_epi32(__mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_vplzcntd_256_mask(
+      (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_lzcnt_epi64(__m256i __A) {
+  return (__m256i)__builtin_ia32_vplzcntq_256_mask(
+      (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_lzcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_vplzcntq_256_mask((__v4di)__A, (__v4di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_lzcnt_epi64(__mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_vplzcntq_256_mask(
+      (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_conflict_epi64(__m256i __A) {
+  return (__m256i)__builtin_ia32_vpconflictdi_256_mask(
+      (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_conflict_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_vpconflictdi_256_mask((__v4di)__A,
+      (__v4di)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_conflict_epi64(__mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_vpconflictdi_256_mask(
+      (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_conflict_epi32(__m256i __A) {
+  return (__m256i)__builtin_ia32_vpconflictsi_256_mask(
+      (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)-1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_conflict_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_vpconflictsi_256_mask((__v8si)__A,
+      (__v8si)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_conflict_epi32(__mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_vpconflictsi_256_mask(
+      (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_lzcnt_epi32(__m128i __A) {
+  return (__m128i)__builtin_ia32_vplzcntd_128_mask(
+      (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_lzcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_vplzcntd_128_mask((__v4si)__A, (__v4si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_lzcnt_epi32(__mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_vplzcntd_128_mask(
+      (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_lzcnt_epi64(__m128i __A) {
+  return (__m128i)__builtin_ia32_vplzcntq_128_mask(
+      (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_lzcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_vplzcntq_128_mask((__v2di)__A, (__v2di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_lzcnt_epi64(__mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_vplzcntq_128_mask(
+      (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_conflict_epi64(__m128i __A) {
+  return (__m128i)__builtin_ia32_vpconflictdi_128_mask(
+      (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_conflict_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_vpconflictdi_128_mask((__v2di)__A,
+      (__v2di)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_conflict_epi64(__mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_vpconflictdi_128_mask(
+      (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_conflict_epi32(__m128i __A) {
+  return (__m128i)__builtin_ia32_vpconflictsi_128_mask(
+      (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_conflict_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_vpconflictsi_128_mask((__v4si)__A,
+      (__v4si)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_conflict_epi32(__mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_vpconflictsi_128_mask(
+      (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+#ifdef __DISABLE_AVX512VLCD__
+#pragma GCC pop_options
+#endif
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d)__builtin_ia32_unpcklpd256_mask((__v4df)__A, (__v4df)__B,
+      (__v4df)__W, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_unpcklpd256_mask(
+      (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_unpcklpd128_mask((__v2df)__A, (__v2df)__B,
+      (__v2df)__W, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_unpcklpd128_mask(
+      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_unpcklps256_mask((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__W, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A,
+        __m256d __B) {
+  return (__m256d)__builtin_ia32_unpckhpd256_mask((__v4df)__A, (__v4df)__B,
+      (__v4df)__W, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  return (__m256d)__builtin_ia32_unpckhpd256_mask(
+      (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_unpckhpd128_mask((__v2df)__A, (__v2df)__B,
+      (__v2df)__W, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  return (__m128d)__builtin_ia32_unpckhpd128_mask(
+      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_unpckhps256_mask((__v8sf)__A, (__v8sf)__B,
+      (__v8sf)__W, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_unpckhps256_mask(
+      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_unpckhps128_mask((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__W, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_unpckhps128_mask(
+      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cvtph_ps(__m128 __W, __mmask8 __U, __m128i __A) {
+  return (__m128)__builtin_ia32_vcvtph2ps_mask((__v8hi)__A, (__v4sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_cvtph_ps(__mmask8 __U, __m128i __A) {
+  return (__m128)__builtin_ia32_vcvtph2ps_mask(
+      (__v8hi)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  return (__m256)__builtin_ia32_unpcklps256_mask(
+      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cvtph_ps(__m256 __W, __mmask8 __U, __m128i __A) {
+  return (__m256)__builtin_ia32_vcvtph2ps256_mask((__v8hi)__A, (__v8sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_cvtph_ps(__mmask8 __U, __m128i __A) {
+  return (__m256)__builtin_ia32_vcvtph2ps256_mask(
+      (__v8hi)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_unpcklps128_mask((__v4sf)__A, (__v4sf)__B,
+      (__v4sf)__W, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  return (__m128)__builtin_ia32_unpcklps128_mask(
+      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  return (__m256i)__builtin_ia32_psrad256_mask((__v8si)__A, (__v4si)__B,
+      (__v8si)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) {
+  return (__m256i)__builtin_ia32_psrad256_mask(
+      (__v8si)__A, (__v4si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_psrad128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_psrad128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_sra_epi64(__m256i __A, __m128i __B) {
+  return (__m256i)__builtin_ia32_psraq256_mask(
+      (__v4di)__A, (__v2di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  return (__m256i)__builtin_ia32_psraq256_mask((__v4di)__A, (__v2di)__B,
+      (__v4di)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) {
+  return (__m256i)__builtin_ia32_psraq256_mask(
+      (__v4di)__A, (__v2di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sra_epi64(__m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_psraq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_psraq128_mask((__v2di)__A, (__v2di)__B,
+      (__v2di)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_psraq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pslld128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pslld128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_psllq128_mask((__v2di)__A, (__v2di)__B,
+      (__v2di)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_psllq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  return (__m256i)__builtin_ia32_pslld256_mask((__v8si)__A, (__v4si)__B,
+      (__v8si)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) {
+  return (__m256i)__builtin_ia32_pslld256_mask(
+      (__v8si)__A, (__v4si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  return (__m256i)__builtin_ia32_psllq256_mask((__v4di)__A, (__v2di)__B,
+      (__v4di)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) {
+  return (__m256i)__builtin_ia32_psllq256_mask(
+      (__v4di)__A, (__v2di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X,
+        __m256 __Y) {
+  return (__m256)__builtin_ia32_permvarsf256_mask((__v8sf)__Y, (__v8si)__X,
+      (__v8sf)__W, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y) {
+  return (__m256)__builtin_ia32_permvarsf256_mask(
+      (__v8sf)__Y, (__v8si)__X, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_permutexvar_pd(__m256i __X, __m256d __Y) {
+  return (__m256d)__builtin_ia32_permvardf256_mask(
+      (__v4df)__Y, (__v4di)__X, (__v4df)_mm256_setzero_pd(), (__mmask8)-1);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_permutexvar_pd(__m256d __W, __mmask8 __U, __m256i __X,
+        __m256d __Y) {
+  return (__m256d)__builtin_ia32_permvardf256_mask((__v4df)__Y, (__v4di)__X,
+      (__v4df)__W, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_permutexvar_pd(__mmask8 __U, __m256i __X, __m256d __Y) {
+  return (__m256d)__builtin_ia32_permvardf256_mask(
+      (__v4df)__Y, (__v4di)__X, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A,
+        __m256i __C) {
+  return (__m256d)__builtin_ia32_vpermilvarpd256_mask(
+      (__v4df)__A, (__v4di)__C, (__v4df)__W, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C) {
+  return (__m256d)__builtin_ia32_vpermilvarpd256_mask(
+      (__v4df)__A, (__v4di)__C, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A,
+        __m256i __C) {
+  return (__m256)__builtin_ia32_vpermilvarps256_mask(
+      (__v8sf)__A, (__v8si)__C, (__v8sf)__W, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C) {
+  return (__m256)__builtin_ia32_vpermilvarps256_mask(
+      (__v8sf)__A, (__v8si)__C, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A,
+        __m128i __C) {
+  return (__m128d)__builtin_ia32_vpermilvarpd_mask((__v2df)__A, (__v2di)__C,
+      (__v2df)__W, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C) {
+  return (__m128d)__builtin_ia32_vpermilvarpd_mask(
+      (__v2df)__A, (__v2di)__C, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C) {
+  return (__m128)__builtin_ia32_vpermilvarps_mask((__v4sf)__A, (__v4si)__C,
+      (__v4sf)__W, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C) {
+  return (__m128)__builtin_ia32_vpermilvarps_mask(
+      (__v4sf)__A, (__v4si)__C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_pmulld256_mask(
+      (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), __M);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_permutexvar_epi64(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_permvardi256_mask(
+      (__v4di)__Y, (__v4di)__X, (__v4di)_mm256_setzero_si256(), __M);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A,
+        __m256i __B) {
+  return (__m256i)__builtin_ia32_pmulld256_mask((__v8si)__A, (__v8si)__B,
+      (__v8si)__W, __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pmulld128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_pmulld128_mask((__v4si)__A, (__v4si)__B,
+      (__v4si)__W, __M);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_pmuldq256_mask((__v8si)__X, (__v8si)__Y,
+      (__v4di)__W, __M);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_pmuldq256_mask(
+      (__v8si)__X, (__v8si)__Y, (__v4di)_mm256_setzero_si256(), __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_pmuldq128_mask((__v4si)__X, (__v4si)__Y,
+      (__v2di)__W, __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_pmuldq128_mask(
+      (__v4si)__X, (__v4si)__Y, (__v2di)_mm_setzero_si128(), __M);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_permutexvar_epi64(__m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_permvardi256_mask(
+      (__v4di)__Y, (__v4di)__X, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_permutexvar_epi64(__m256i __W, __mmask8 __M, __m256i __X,
+        __m256i __Y) {
+  return (__m256i)__builtin_ia32_permvardi256_mask((__v4di)__Y, (__v4di)__X,
+      (__v4di)__W, __M);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_pmuludq256_mask((__v8si)__X, (__v8si)__Y,
+      (__v4di)__W, __M);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_permvarsi256_mask(
+      (__v8si)__Y, (__v8si)__X, (__v8si)_mm256_setzero_si256(), __M);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_pmuludq256_mask(
+      (__v8si)__X, (__v8si)__Y, (__v4di)_mm256_setzero_si256(), __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_pmuludq128_mask((__v4si)__X, (__v4si)__Y,
+      (__v2di)__W, __M);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__m128i)__builtin_ia32_pmuludq128_mask(
+      (__v4si)__X, (__v4si)__Y, (__v2di)_mm_setzero_si128(), __M);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_permutexvar_epi32(__m256i __X, __m256i __Y) {
+  return (__m256i)__builtin_ia32_permvarsi256_mask(
+      (__v8si)__Y, (__v8si)__X, (__v8si)_mm256_setzero_si256(), (__mmask8)-1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X,
+        __m256i __Y) {
+  return (__m256i)__builtin_ia32_permvarsi256_mask((__v8si)__Y, (__v8si)__X,
+      (__v8si)__W, __M);
+}
+
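+/*
+ * The vpcmp[u]d/vpcmp[u]q comparisons below encode their predicate as an
+ * immediate operand: 0 = EQ, 1 = LT, 2 = LE, 4 = NE, 5 = NLT (i.e. GE),
+ * 6 = NLE (i.e. GT).
+ */
+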
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpneq_epu32_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 4,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpneq_epu32_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 4,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmplt_epu32_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 1,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmplt_epu32_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 1,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpge_epu32_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 5,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpge_epu32_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 5,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmple_epu32_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 2,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmple_epu32_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 2,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpneq_epu64_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 4,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpneq_epu64_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 4,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmplt_epu64_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 1,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmplt_epu64_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 1,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpge_epu64_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 5,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpge_epu64_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 5,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmple_epu64_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 2,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmple_epu64_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 2,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpneq_epi32_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 4,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpneq_epi32_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 4,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmplt_epi32_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 1,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmplt_epi32_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 1,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpge_epi32_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 5,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpge_epi32_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 5,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmple_epi32_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 2,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmple_epi32_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 2,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpneq_epi64_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 4,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpneq_epi64_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 4,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmplt_epi64_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 1,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmplt_epi64_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 1,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmpge_epi64_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 5,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmpge_epi64_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 5,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmple_epi64_mask(__mmask8 __M, __m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 2,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmple_epi64_mask(__m256i __X, __m256i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 2,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpneq_epu32_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 4,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpneq_epu32_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 4,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmplt_epu32_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 1,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmplt_epu32_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 1,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpge_epu32_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 5,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpge_epu32_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 5,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmple_epu32_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 2,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmple_epu32_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 2,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpneq_epu64_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 4,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpneq_epu64_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 4,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmplt_epu64_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 1,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmplt_epu64_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 1,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpge_epu64_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 5,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpge_epu64_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 5,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmple_epu64_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 2,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmple_epu64_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 2,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpneq_epi32_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 4,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpneq_epi32_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 4,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmplt_epi32_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 1,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmplt_epi32_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 1,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpge_epi32_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 5,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpge_epi32_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 5,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmple_epi32_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 2,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmple_epi32_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 2,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpneq_epi64_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 4,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpneq_epi64_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 4,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmplt_epi64_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 1,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmplt_epi64_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 1,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmpge_epi64_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 5,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpge_epi64_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 5,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmple_epi64_mask(__mmask8 __M, __m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 2,
+      (__mmask8)__M);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmple_epi64_mask(__m128i __X, __m128i __Y) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 2,
+      (__mmask8)-1);
+}
+
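+/*
+ * The intrinsics below take immediate operands, which must fold to integer
+ * constants at compile time, so their inline definitions are only provided
+ * when optimization is enabled.
+ */
+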
__artificial__)) + _mm_mask_shuffle_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __imm) { + return (__m128)__builtin_ia32_shufps128_mask((__v4sf)__A, (__v4sf)__B, __imm, + (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_shuffle_ps(__mmask8 __U, __m128 __A, __m128 __B, const int __imm) { + return (__m128)__builtin_ia32_shufps128_mask( + (__v4sf)__A, (__v4sf)__B, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_inserti32x4(__m256i __A, __m128i __B, const int __imm) { + return (__m256i)__builtin_ia32_inserti32x4_256_mask( + (__v8si)__A, (__v4si)__B, __imm, (__v8si)_mm256_setzero_si256(), + (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_inserti32x4(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B, + const int __imm) { + return (__m256i)__builtin_ia32_inserti32x4_256_mask( + (__v8si)__A, (__v4si)__B, __imm, (__v8si)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_inserti32x4(__mmask8 __U, __m256i __A, __m128i __B, + const int __imm) { + return (__m256i)__builtin_ia32_inserti32x4_256_mask( + (__v8si)__A, (__v4si)__B, __imm, (__v8si)_mm256_setzero_si256(), + (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_insertf32x4(__m256 __A, __m128 __B, const int __imm) { + return (__m256)__builtin_ia32_insertf32x4_256_mask( + (__v8sf)__A, (__v4sf)__B, __imm, (__v8sf)_mm256_setzero_ps(), + (__mmask8)-1); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_insertf32x4(__m256 __W, __mmask8 __U, __m256 __A, __m128 __B, + const int __imm) { + return (__m256)__builtin_ia32_insertf32x4_256_mask( + (__v8sf)__A, (__v4sf)__B, __imm, (__v8sf)__W, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_insertf32x4(__mmask8 __U, __m256 __A, __m128 __B, + const int __imm) { + return (__m256)__builtin_ia32_insertf32x4_256_mask( + (__v8sf)__A, (__v4sf)__B, __imm, (__v8sf)_mm256_setzero_ps(), + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_extracti32x4_epi32(__m256i __A, const int __imm) { + return (__m128i)__builtin_ia32_extracti32x4_256_mask( + (__v8si)__A, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)-1); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __A, + const int __imm) { + return (__m128i)__builtin_ia32_extracti32x4_256_mask( + (__v8si)__A, __imm, (__v4si)__W, (__mmask8)__U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A, const int __imm) { + return (__m128i)__builtin_ia32_extracti32x4_256_mask( + (__v8si)__A, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_extractf32x4_ps(__m256 __A, const int __imm) { + return (__m128)__builtin_ia32_extractf32x4_256_mask( + (__v8sf)__A, __imm, (__v4sf)_mm_setzero_ps(), 
(__mmask8)-1); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A, + const int __imm) { + return (__m128)__builtin_ia32_extractf32x4_256_mask( + (__v8sf)__A, __imm, (__v4sf)__W, (__mmask8)__U); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A, const int __imm) { + return (__m128)__builtin_ia32_extractf32x4_256_mask( + (__v8sf)__A, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shuffle_i64x2(__m256i __A, __m256i __B, const int __imm) { + return (__m256i)__builtin_ia32_shuf_i64x2_256_mask( + (__v4di)__A, (__v4di)__B, __imm, (__v4di)_mm256_setzero_si256(), + (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B, const int __imm) { + return (__m256i)__builtin_ia32_shuf_i64x2_256_mask( + (__v4di)__A, (__v4di)__B, __imm, (__v4di)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B, + const int __imm) { + return (__m256i)__builtin_ia32_shuf_i64x2_256_mask( + (__v4di)__A, (__v4di)__B, __imm, (__v4di)_mm256_setzero_si256(), + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shuffle_i32x4(__m256i __A, __m256i __B, const int __imm) { + return (__m256i)__builtin_ia32_shuf_i32x4_256_mask( + (__v8si)__A, (__v8si)__B, __imm, (__v8si)_mm256_setzero_si256(), + (__mmask8)-1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B, const int __imm) { + return (__m256i)__builtin_ia32_shuf_i32x4_256_mask( + (__v8si)__A, (__v8si)__B, __imm, (__v8si)__W, (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B, + const int __imm) { + return (__m256i)__builtin_ia32_shuf_i32x4_256_mask( + (__v8si)__A, (__v8si)__B, __imm, (__v8si)_mm256_setzero_si256(), + (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shuffle_f64x2(__m256d __A, __m256d __B, const int __imm) { + return (__m256d)__builtin_ia32_shuf_f64x2_256_mask( + (__v4df)__A, (__v4df)__B, __imm, (__v4df)_mm256_setzero_pd(), + (__mmask8)-1); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B, const int __imm) { + return (__m256d)__builtin_ia32_shuf_f64x2_256_mask( + (__v4df)__A, (__v4df)__B, __imm, (__v4df)__W, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B, + const int __imm) { + return (__m256d)__builtin_ia32_shuf_f64x2_256_mask( + (__v4df)__A, (__v4df)__B, __imm, (__v4df)_mm256_setzero_pd(), + (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm256_shuffle_f32x4(__m256 __A, __m256 __B, const int __imm) { + return (__m256)__builtin_ia32_shuf_f32x4_256_mask( + (__v8sf)__A, (__v8sf)__B, __imm, (__v8sf)_mm256_setzero_ps(), + (__mmask8)-1); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B, + const int __imm) { + return (__m256)__builtin_ia32_shuf_f32x4_256_mask( + (__v8sf)__A, (__v8sf)__B, __imm, (__v8sf)__W, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B, + const int __imm) { + return (__m256)__builtin_ia32_shuf_f32x4_256_mask( + (__v8sf)__A, (__v8sf)__B, __imm, (__v8sf)_mm256_setzero_ps(), + (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fixupimm_pd(__m256d __A, __m256d __B, __m256i __C, const int __imm) { + return (__m256d)__builtin_ia32_fixupimmpd256_mask( + (__v4df)__A, (__v4df)__B, (__v4di)__C, __imm, (__mmask8)-1); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_fixupimm_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256i __C, + const int __imm) { + return (__m256d)__builtin_ia32_fixupimmpd256_mask( + (__v4df)__A, (__v4df)__B, (__v4di)__C, __imm, (__mmask8)__U); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_fixupimm_pd(__mmask8 __U, __m256d __A, __m256d __B, + __m256i __C, const int __imm) { + return (__m256d)__builtin_ia32_fixupimmpd256_maskz( + (__v4df)__A, (__v4df)__B, (__v4di)__C, __imm, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fixupimm_ps(__m256 __A, __m256 __B, __m256i __C, const int __imm) { + return (__m256)__builtin_ia32_fixupimmps256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8si)__C, __imm, (__mmask8)-1); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_fixupimm_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256i __C, + const int __imm) { + return (__m256)__builtin_ia32_fixupimmps256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8si)__C, __imm, (__mmask8)__U); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_fixupimm_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256i __C, + const int __imm) { + return (__m256)__builtin_ia32_fixupimmps256_maskz( + (__v8sf)__A, (__v8sf)__B, (__v8si)__C, __imm, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fixupimm_pd(__m128d __A, __m128d __B, __m128i __C, const int __imm) { + return (__m128d)__builtin_ia32_fixupimmpd128_mask( + (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)-1); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fixupimm_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128i __C, + const int __imm) { + return (__m128d)__builtin_ia32_fixupimmpd128_mask( + (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_fixupimm_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128i __C, + const int __imm) { + return (__m128d)__builtin_ia32_fixupimmpd128_maskz( + (__v2df)__A, 
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_fixupimm_ps(__m128 __A, __m128 __B, __m128i __C, const int __imm) {
+  return (__m128)__builtin_ia32_fixupimmps128_mask(
+      (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)-1);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_fixupimm_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128i __C,
+        const int __imm) {
+  return (__m128)__builtin_ia32_fixupimmps128_mask(
+      (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_fixupimm_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128i __C,
+        const int __imm) {
+  return (__m128)__builtin_ia32_fixupimmps128_maskz(
+      (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A,
+        const int __imm) {
+  return (__m256i)__builtin_ia32_psrldi256_mask((__v8si)__A, __imm, (__v8si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, const int __imm) {
+  return (__m256i)__builtin_ia32_psrldi256_mask(
+      (__v8si)__A, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, const int __imm) {
+  return (__m128i)__builtin_ia32_psrldi128_mask((__v4si)__A, __imm, (__v4si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, const int __imm) {
+  return (__m128i)__builtin_ia32_psrldi128_mask(
+      (__v4si)__A, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A,
+        const int __imm) {
+  return (__m256i)__builtin_ia32_psrlqi256_mask((__v4di)__A, __imm, (__v4di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, const int __imm) {
+  return (__m256i)__builtin_ia32_psrlqi256_mask(
+      (__v4di)__A, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, const int __imm) {
+  return (__m128i)__builtin_ia32_psrlqi128_mask((__v2di)__A, __imm, (__v2di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, const int __imm) {
+  return (__m128i)__builtin_ia32_psrlqi128_mask(
+      (__v2di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_ternarylogic_epi64(__m256i __A, __m256i __B, __m256i __C,
+        const int __imm) {
+  return (__m256i)__builtin_ia32_pternlogq256_mask(
+      (__v4di)__A, (__v4di)__B, (__v4di)__C, __imm, (__mmask8)-1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_ternarylogic_epi64(__m256i __A, __mmask8 __U, __m256i __B,
+        __m256i __C, const int __imm) {
+  return (__m256i)__builtin_ia32_pternlogq256_mask(
+      (__v4di)__A, (__v4di)__B, (__v4di)__C, __imm, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_ternarylogic_epi64(__mmask8 __U, __m256i __A, __m256i __B,
+        __m256i __C, const int __imm) {
+  return (__m256i)__builtin_ia32_pternlogq256_maskz(
+      (__v4di)__A, (__v4di)__B, (__v4di)__C, __imm, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_ternarylogic_epi32(__m256i __A, __m256i __B, __m256i __C,
+        const int __imm) {
+  return (__m256i)__builtin_ia32_pternlogd256_mask(
+      (__v8si)__A, (__v8si)__B, (__v8si)__C, __imm, (__mmask8)-1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_ternarylogic_epi32(__m256i __A, __mmask8 __U, __m256i __B,
+        __m256i __C, const int __imm) {
+  return (__m256i)__builtin_ia32_pternlogd256_mask(
+      (__v8si)__A, (__v8si)__B, (__v8si)__C, __imm, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_ternarylogic_epi32(__mmask8 __U, __m256i __A, __m256i __B,
+        __m256i __C, const int __imm) {
+  return (__m256i)__builtin_ia32_pternlogd256_maskz(
+      (__v8si)__A, (__v8si)__B, (__v8si)__C, __imm, (__mmask8)__U);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm_ternarylogic_epi64(__m128i __A, __m128i __B, __m128i __C, const int __imm) {
+  return (__m128i)__builtin_ia32_pternlogq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)__C, __imm, (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_ternarylogic_epi64(__m128i __A, __mmask8 __U, __m128i __B,
+        __m128i __C, const int __imm) {
+  return (__m128i)__builtin_ia32_pternlogq128_mask(
+      (__v2di)__A, (__v2di)__B, (__v2di)__C, __imm, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_ternarylogic_epi64(__mmask8 __U, __m128i __A, __m128i __B,
+        __m128i __C, const int __imm) {
+  return (__m128i)__builtin_ia32_pternlogq128_maskz(
+      (__v2di)__A, (__v2di)__B, (__v2di)__C, __imm, (__mmask8)__U);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm_ternarylogic_epi32(__m128i __A, __m128i __B, __m128i __C, const int __imm) {
+  return (__m128i)__builtin_ia32_pternlogd128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)__C, __imm, (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_ternarylogic_epi32(__m128i __A, __mmask8 __U, __m128i __B,
+        __m128i __C, const int __imm) {
+  return (__m128i)__builtin_ia32_pternlogd128_mask(
+      (__v4si)__A, (__v4si)__B, (__v4si)__C, __imm, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_ternarylogic_epi32(__mmask8 __U, __m128i __A, __m128i __B,
+        __m128i __C, const int __imm) {
+  return (__m128i)__builtin_ia32_pternlogd128_maskz(
+      (__v4si)__A, (__v4si)__B, (__v4si)__C, __imm, (__mmask8)__U);
+}
+
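/* Illustrative sketch, not part of this patch: vpternlog evaluates an
   arbitrary three-input boolean function in a single instruction. The imm8
   is the truth table itself, indexed by the bit triple (a<<2)|(b<<1)|c, so
   0xCA encodes bitwise select, (a & b) | (~a & c). Assumes AVX-512VL. */
static inline __m256i select_bits(__m256i a, __m256i b, __m256i c) {
  return _mm256_ternarylogic_epi64(a, b, c, 0xCA); /* a ? b : c, per bit */
}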
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_roundscale_ps(__m256 __A, const int __imm) {
+  return (__m256)__builtin_ia32_rndscaleps_256_mask(
+      (__v8sf)__A, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_roundscale_ps(__m256 __W, __mmask8 __U, __m256 __A,
+        const int __imm) {
+  return (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)__A, __imm,
+      (__v8sf)__W, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_roundscale_ps(__mmask8 __U, __m256 __A, const int __imm) {
+  return (__m256)__builtin_ia32_rndscaleps_256_mask(
+      (__v8sf)__A, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_roundscale_pd(__m256d __A, const int __imm) {
+  return (__m256d)__builtin_ia32_rndscalepd_256_mask(
+      (__v4df)__A, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)-1);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_roundscale_pd(__m256d __W, __mmask8 __U, __m256d __A,
+        const int __imm) {
+  return (__m256d)__builtin_ia32_rndscalepd_256_mask(
+      (__v4df)__A, __imm, (__v4df)__W, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_roundscale_pd(__mmask8 __U, __m256d __A, const int __imm) {
+  return (__m256d)__builtin_ia32_rndscalepd_256_mask(
+      (__v4df)__A, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_roundscale_ps(__m128 __A, const int __imm) {
+  return (__m128)__builtin_ia32_rndscaleps_128_mask(
+      (__v4sf)__A, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)-1);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
+                                      __artificial__))
+_mm_mask_roundscale_ps(__m128 __W, __mmask8 __U, __m128 __A, const int __imm) {
+  return (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)__A, __imm,
+      (__v4sf)__W, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_roundscale_ps(__mmask8 __U, __m128 __A, const int __imm) {
+  return (__m128)__builtin_ia32_rndscaleps_128_mask(
+      (__v4sf)__A, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_roundscale_pd(__m128d __A, const int __imm) {
+  return (__m128d)__builtin_ia32_rndscalepd_128_mask(
+      (__v2df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)-1);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_roundscale_pd(__m128d __W, __mmask8 __U, __m128d __A,
+        const int __imm) {
+  return (__m128d)__builtin_ia32_rndscalepd_128_mask(
+      (__v2df)__A, __imm, (__v2df)__W, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_roundscale_pd(__mmask8 __U, __m128d __A, const int __imm) {
+  return (__m128d)__builtin_ia32_rndscalepd_128_mask(
+      (__v2df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
+}
+
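/* Illustrative sketch, not from the patch: vrndscale generalizes
   floor/ceil/trunc. The imm8's low nibble selects the rounding mode
   (0 nearest, 1 down, 2 up, 3 truncate) and the high nibble M rounds to
   a multiple of 2**-M, i.e. the result is 2**-M * round(2**M * x). */
static inline __m256 floor8(__m256 x) {
  return _mm256_roundscale_ps(x, 0x01); /* M=0, round down: floor() */
}
static inline __m256 floor8_to_halves(__m256 x) {
  return _mm256_roundscale_ps(x, 0x11); /* M=1: floor(2x)/2 */
}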
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_getmant_ps(__m256 __A, _MM_MANTISSA_NORM_ENUM __B,
+        _MM_MANTISSA_SIGN_ENUM __C) {
+  return (__m256)__builtin_ia32_getmantps256_mask(
+      (__v8sf)__A, (__C << 2) | __B, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_getmant_ps(__m256 __W, __mmask8 __U, __m256 __A,
+        _MM_MANTISSA_NORM_ENUM __B,
+        _MM_MANTISSA_SIGN_ENUM __C) {
+  return (__m256)__builtin_ia32_getmantps256_mask((__v8sf)__A, (__C << 2) | __B,
+      (__v8sf)__W, (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_getmant_ps(__mmask8 __U, __m256 __A,
+        _MM_MANTISSA_NORM_ENUM __B,
+        _MM_MANTISSA_SIGN_ENUM __C) {
+  return (__m256)__builtin_ia32_getmantps256_mask((__v8sf)__A, (__C << 2) | __B,
+      (__v8sf)_mm256_setzero_ps(),
+      (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_getmant_ps(__m128 __A, _MM_MANTISSA_NORM_ENUM __B,
+        _MM_MANTISSA_SIGN_ENUM __C) {
+  return (__m128)__builtin_ia32_getmantps128_mask(
+      (__v4sf)__A, (__C << 2) | __B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_getmant_ps(__m128 __W, __mmask8 __U, __m128 __A,
+        _MM_MANTISSA_NORM_ENUM __B,
+        _MM_MANTISSA_SIGN_ENUM __C) {
+  return (__m128)__builtin_ia32_getmantps128_mask((__v4sf)__A, (__C << 2) | __B,
+      (__v4sf)__W, (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_getmant_ps(__mmask8 __U, __m128 __A, _MM_MANTISSA_NORM_ENUM __B,
+        _MM_MANTISSA_SIGN_ENUM __C) {
+  return (__m128)__builtin_ia32_getmantps128_mask(
+      (__v4sf)__A, (__C << 2) | __B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_getmant_pd(__m256d __A, _MM_MANTISSA_NORM_ENUM __B,
+        _MM_MANTISSA_SIGN_ENUM __C) {
+  return (__m256d)__builtin_ia32_getmantpd256_mask(
+      (__v4df)__A, (__C << 2) | __B, (__v4df)_mm256_setzero_pd(), (__mmask8)-1);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_getmant_pd(__m256d __W, __mmask8 __U, __m256d __A,
+        _MM_MANTISSA_NORM_ENUM __B,
+        _MM_MANTISSA_SIGN_ENUM __C) {
+  return (__m256d)__builtin_ia32_getmantpd256_mask(
+      (__v4df)__A, (__C << 2) | __B, (__v4df)__W, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_getmant_pd(__mmask8 __U, __m256d __A,
+        _MM_MANTISSA_NORM_ENUM __B,
+        _MM_MANTISSA_SIGN_ENUM __C) {
+  return (__m256d)__builtin_ia32_getmantpd256_mask(
+      (__v4df)__A, (__C << 2) | __B, (__v4df)_mm256_setzero_pd(),
+      (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_getmant_pd(__m128d __A, _MM_MANTISSA_NORM_ENUM __B,
+        _MM_MANTISSA_SIGN_ENUM __C) {
+  return (__m128d)__builtin_ia32_getmantpd128_mask(
+      (__v2df)__A, (__C << 2) | __B, (__v2df)_mm_setzero_pd(), (__mmask8)-1);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_getmant_pd(__m128d __W, __mmask8 __U, __m128d __A,
+        _MM_MANTISSA_NORM_ENUM __B,
+        _MM_MANTISSA_SIGN_ENUM __C) {
+  return (__m128d)__builtin_ia32_getmantpd128_mask(
+      (__v2df)__A, (__C << 2) | __B, (__v2df)__W, (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_getmant_pd(__mmask8 __U, __m128d __A, _MM_MANTISSA_NORM_ENUM __B,
+        _MM_MANTISSA_SIGN_ENUM __C) {
+  return (__m128d)__builtin_ia32_getmantpd128_mask(
+      (__v2df)__A, (__C << 2) | __B, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
+}
+
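/* Illustrative sketch (editor's assumption that the _MM_MANT_* enum values
   from the AVX-512F header are in scope): vgetmant pulls out each lane's
   mantissa normalized into a chosen interval, which pairs with getexp to
   split x into significand and exponent without bit fiddling. */
static inline __m256 mantissa_1_2(__m256 x) {
  /* |mantissa| in [1,2), sign forced to zero */
  return _mm256_getmant_ps(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero);
}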
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mmask_i32gather_ps(__m256 __v1_old, __mmask8 __mask, __m256i __index,
+        void const *__addr, int __scale) {
+  return (__m256)__builtin_ia32_gather3siv8sf((__v8sf)__v1_old, __addr,
+      (__v8si)__index, __mask, __scale);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mmask_i32gather_ps(__m128 __v1_old, __mmask8 __mask, __m128i __index,
+        void const *__addr, int __scale) {
+  return (__m128)__builtin_ia32_gather3siv4sf((__v4sf)__v1_old, __addr,
+      (__v4si)__index, __mask, __scale);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mmask_i32gather_pd(__m256d __v1_old, __mmask8 __mask,
+        __m128i __index, void const *__addr, int __scale) {
+  return (__m256d)__builtin_ia32_gather3siv4df(
+      (__v4df)__v1_old, __addr, (__v4si)__index, __mask, __scale);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mmask_i32gather_pd(__m128d __v1_old, __mmask8 __mask, __m128i __index,
+        void const *__addr, int __scale) {
+  return (__m128d)__builtin_ia32_gather3siv2df(
+      (__v2df)__v1_old, __addr, (__v4si)__index, __mask, __scale);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mmask_i64gather_ps(__m128 __v1_old, __mmask8 __mask, __m256i __index,
+        void const *__addr, int __scale) {
+  return (__m128)__builtin_ia32_gather3div8sf((__v4sf)__v1_old, __addr,
+      (__v4di)__index, __mask, __scale);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mmask_i64gather_ps(__m128 __v1_old, __mmask8 __mask, __m128i __index,
+        void const *__addr, int __scale) {
+  return (__m128)__builtin_ia32_gather3div4sf((__v4sf)__v1_old, __addr,
+      (__v2di)__index, __mask, __scale);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mmask_i64gather_pd(__m256d __v1_old, __mmask8 __mask,
+        __m256i __index, void const *__addr, int __scale) {
+  return (__m256d)__builtin_ia32_gather3div4df(
+      (__v4df)__v1_old, __addr, (__v4di)__index, __mask, __scale);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mmask_i64gather_pd(__m128d __v1_old, __mmask8 __mask, __m128i __index,
+        void const *__addr, int __scale) {
+  return (__m128d)__builtin_ia32_gather3div2df(
+      (__v2df)__v1_old, __addr, (__v2di)__index, __mask, __scale);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mmask_i32gather_epi32(__m256i __v1_old, __mmask8 __mask,
+        __m256i __index, void const *__addr, int __scale) {
+  return (__m256i)__builtin_ia32_gather3siv8si(
+      (__v8si)__v1_old, __addr, (__v8si)__index, __mask, __scale);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mmask_i32gather_epi32(__m128i __v1_old, __mmask8 __mask,
+        __m128i __index, void const *__addr, int __scale) {
+  return (__m128i)__builtin_ia32_gather3siv4si(
+      (__v4si)__v1_old, __addr, (__v4si)__index, __mask, __scale);
+}
+
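/* Illustrative sketch, not part of this patch: the mmask gather forms load
   only the lanes whose mask bit is set and keep __v1_old elsewhere, the
   usual way to tail-guard an indexed load. The helper below is hypothetical;
   n is assumed to be in [0,8], and __scale must be a literal 1/2/4/8. */
static inline __m256 gather_first_n(const float *base, __m256i idx, int n) {
  __mmask8 k = (__mmask8)((1u << n) - 1); /* low n lanes only */
  return _mm256_mmask_i32gather_ps(_mm256_setzero_ps(), k, idx, base, 4);
}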
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mmask_i32gather_epi64(__m256i __v1_old, __mmask8 __mask,
+        __m128i __index, void const *__addr, int __scale) {
+  return (__m256i)__builtin_ia32_gather3siv4di(
+      (__v4di)__v1_old, __addr, (__v4si)__index, __mask, __scale);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mmask_i32gather_epi64(__m128i __v1_old, __mmask8 __mask,
+        __m128i __index, void const *__addr, int __scale) {
+  return (__m128i)__builtin_ia32_gather3siv2di(
+      (__v2di)__v1_old, __addr, (__v4si)__index, __mask, __scale);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mmask_i64gather_epi32(__m128i __v1_old, __mmask8 __mask,
+        __m256i __index, void const *__addr, int __scale) {
+  return (__m128i)__builtin_ia32_gather3div8si(
+      (__v4si)__v1_old, __addr, (__v4di)__index, __mask, __scale);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mmask_i64gather_epi32(__m128i __v1_old, __mmask8 __mask,
+        __m128i __index, void const *__addr, int __scale) {
+  return (__m128i)__builtin_ia32_gather3div4si(
+      (__v4si)__v1_old, __addr, (__v2di)__index, __mask, __scale);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mmask_i64gather_epi64(__m256i __v1_old, __mmask8 __mask,
+        __m256i __index, void const *__addr, int __scale) {
+  return (__m256i)__builtin_ia32_gather3div4di(
+      (__v4di)__v1_old, __addr, (__v4di)__index, __mask, __scale);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mmask_i64gather_epi64(__m128i __v1_old, __mmask8 __mask,
+        __m128i __index, void const *__addr, int __scale) {
+  return (__m128i)__builtin_ia32_gather3div2di(
+      (__v2di)__v1_old, __addr, (__v2di)__index, __mask, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_i32scatter_ps(void *__addr, __m256i __index, __m256 __v1,
+        const int __scale) {
+  __builtin_ia32_scattersiv8sf(__addr, (__mmask8)0xFF, (__v8si)__index,
+      (__v8sf)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_i32scatter_ps(void *__addr, __mmask8 __mask, __m256i __index,
+        __m256 __v1, const int __scale) {
+  __builtin_ia32_scattersiv8sf(__addr, __mask, (__v8si)__index, (__v8sf)__v1,
+      __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_i32scatter_ps(void *__addr, __m128i __index, __m128 __v1,
+        const int __scale) {
+  __builtin_ia32_scattersiv4sf(__addr, (__mmask8)0xFF, (__v4si)__index,
+      (__v4sf)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_i32scatter_ps(void *__addr, __mmask8 __mask, __m128i __index,
+        __m128 __v1, const int __scale) {
+  __builtin_ia32_scattersiv4sf(__addr, __mask, (__v4si)__index, (__v4sf)__v1,
+      __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_i32scatter_pd(void *__addr, __m128i __index, __m256d __v1,
+        const int __scale) {
+  __builtin_ia32_scattersiv4df(__addr, (__mmask8)0xFF, (__v4si)__index,
+      (__v4df)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_i32scatter_pd(void *__addr, __mmask8 __mask, __m128i __index,
+        __m256d __v1, const int __scale) {
+  __builtin_ia32_scattersiv4df(__addr, __mask, (__v4si)__index, (__v4df)__v1,
+      __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_i32scatter_pd(void *__addr, __m128i __index, __m128d __v1,
+        const int __scale) {
+  __builtin_ia32_scattersiv2df(__addr, (__mmask8)0xFF, (__v4si)__index,
+      (__v2df)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_i32scatter_pd(void *__addr, __mmask8 __mask, __m128i __index,
+        __m128d __v1, const int __scale) {
+  __builtin_ia32_scattersiv2df(__addr, __mask, (__v4si)__index, (__v2df)__v1,
+      __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_i64scatter_ps(void *__addr, __m256i __index, __m128 __v1,
+        const int __scale) {
+  __builtin_ia32_scatterdiv8sf(__addr, (__mmask8)0xFF, (__v4di)__index,
+      (__v4sf)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_i64scatter_ps(void *__addr, __mmask8 __mask, __m256i __index,
+        __m128 __v1, const int __scale) {
+  __builtin_ia32_scatterdiv8sf(__addr, __mask, (__v4di)__index, (__v4sf)__v1,
+      __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_i64scatter_ps(void *__addr, __m128i __index, __m128 __v1,
+        const int __scale) {
+  __builtin_ia32_scatterdiv4sf(__addr, (__mmask8)0xFF, (__v2di)__index,
+      (__v4sf)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_i64scatter_ps(void *__addr, __mmask8 __mask, __m128i __index,
+        __m128 __v1, const int __scale) {
+  __builtin_ia32_scatterdiv4sf(__addr, __mask, (__v2di)__index, (__v4sf)__v1,
+      __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_i64scatter_pd(void *__addr, __m256i __index, __m256d __v1,
+        const int __scale) {
+  __builtin_ia32_scatterdiv4df(__addr, (__mmask8)0xFF, (__v4di)__index,
+      (__v4df)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_i64scatter_pd(void *__addr, __mmask8 __mask, __m256i __index,
+        __m256d __v1, const int __scale) {
+  __builtin_ia32_scatterdiv4df(__addr, __mask, (__v4di)__index, (__v4df)__v1,
+      __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_i64scatter_pd(void *__addr, __m128i __index, __m128d __v1,
+        const int __scale) {
+  __builtin_ia32_scatterdiv2df(__addr, (__mmask8)0xFF, (__v2di)__index,
+      (__v2df)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_i64scatter_pd(void *__addr, __mmask8 __mask, __m128i __index,
+        __m128d __v1, const int __scale) {
+  __builtin_ia32_scatterdiv2df(__addr, __mask, (__v2di)__index, (__v2df)__v1,
+      __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_i32scatter_epi32(void *__addr, __m256i __index, __m256i __v1,
+        const int __scale) {
+  __builtin_ia32_scattersiv8si(__addr, (__mmask8)0xFF, (__v8si)__index,
+      (__v8si)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_i32scatter_epi32(void *__addr, __mmask8 __mask, __m256i __index,
+        __m256i __v1, const int __scale) {
+  __builtin_ia32_scattersiv8si(__addr, __mask, (__v8si)__index, (__v8si)__v1,
+      __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_i32scatter_epi32(void *__addr, __m128i __index, __m128i __v1,
+        const int __scale) {
+  __builtin_ia32_scattersiv4si(__addr, (__mmask8)0xFF, (__v4si)__index,
+      (__v4si)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_i32scatter_epi32(void *__addr, __mmask8 __mask, __m128i __index,
+        __m128i __v1, const int __scale) {
+  __builtin_ia32_scattersiv4si(__addr, __mask, (__v4si)__index, (__v4si)__v1,
+      __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_i32scatter_epi64(void *__addr, __m128i __index, __m256i __v1,
+        const int __scale) {
+  __builtin_ia32_scattersiv4di(__addr, (__mmask8)0xFF, (__v4si)__index,
+      (__v4di)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_i32scatter_epi64(void *__addr, __mmask8 __mask, __m128i __index,
+        __m256i __v1, const int __scale) {
+  __builtin_ia32_scattersiv4di(__addr, __mask, (__v4si)__index, (__v4di)__v1,
+      __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_i32scatter_epi64(void *__addr, __m128i __index, __m128i __v1,
+        const int __scale) {
+  __builtin_ia32_scattersiv2di(__addr, (__mmask8)0xFF, (__v4si)__index,
+      (__v2di)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_i32scatter_epi64(void *__addr, __mmask8 __mask, __m128i __index,
+        __m128i __v1, const int __scale) {
+  __builtin_ia32_scattersiv2di(__addr, __mask, (__v4si)__index, (__v2di)__v1,
+      __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_i64scatter_epi32(void *__addr, __m256i __index, __m128i __v1,
+        const int __scale) {
+  __builtin_ia32_scatterdiv8si(__addr, (__mmask8)0xFF, (__v4di)__index,
+      (__v4si)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, __m256i __index,
+        __m128i __v1, const int __scale) {
+  __builtin_ia32_scatterdiv8si(__addr, __mask, (__v4di)__index, (__v4si)__v1,
+      __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_i64scatter_epi32(void *__addr, __m128i __index, __m128i __v1,
+        const int __scale) {
+  __builtin_ia32_scatterdiv4si(__addr, (__mmask8)0xFF, (__v2di)__index,
+      (__v4si)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, __m128i __index,
+        __m128i __v1, const int __scale) {
+  __builtin_ia32_scatterdiv4si(__addr, __mask, (__v2di)__index, (__v4si)__v1,
+      __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_i64scatter_epi64(void *__addr, __m256i __index, __m256i __v1,
+        const int __scale) {
+  __builtin_ia32_scatterdiv4di(__addr, (__mmask8)0xFF, (__v4di)__index,
+      (__v4di)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_i64scatter_epi64(void *__addr, __mmask8 __mask, __m256i __index,
+        __m256i __v1, const int __scale) {
+  __builtin_ia32_scatterdiv4di(__addr, __mask, (__v4di)__index, (__v4di)__v1,
+      __scale);
+}
+
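/* Illustrative sketch (hypothetical helper): scatters are the store-side
   dual of the gathers above, and the unmasked wrappers simply pass an
   all-lanes 0xFF mask. A guarded indexed store is a single call: */
static inline void scatter_masked(float *base, __mmask8 k, __m256i idx,
                                  __m256 vals) {
  _mm256_mask_i32scatter_ps(base, k, idx, vals, 4); /* 4 = sizeof(float) */
}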
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_i64scatter_epi64(void *__addr, __m128i __index, __m128i __v1,
+        const int __scale) {
+  __builtin_ia32_scatterdiv2di(__addr, (__mmask8)0xFF, (__v2di)__index,
+      (__v2di)__v1, __scale);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_i64scatter_epi64(void *__addr, __mmask8 __mask, __m128i __index,
+        __m128i __v1, const int __scale) {
+  __builtin_ia32_scatterdiv2di(__addr, __mask, (__v2di)__index, (__v2di)__v1,
+      __scale);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_shuffle_epi32(__m256i __W, __mmask8 __U, __m256i __A,
+        _MM_PERM_ENUM __mask) {
+  return (__m256i)__builtin_ia32_pshufd256_mask((__v8si)__A, __mask,
+      (__v8si)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A, _MM_PERM_ENUM __mask) {
+  return (__m256i)__builtin_ia32_pshufd256_mask(
+      (__v8si)__A, __mask, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_shuffle_epi32(__m128i __W, __mmask8 __U, __m128i __A,
+        _MM_PERM_ENUM __mask) {
+  return (__m128i)__builtin_ia32_pshufd128_mask((__v4si)__A, __mask,
+      (__v4si)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_shuffle_epi32(__mmask8 __U, __m128i __A, _MM_PERM_ENUM __mask) {
+  return (__m128i)__builtin_ia32_pshufd128_mask(
+      (__v4si)__A, __mask, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_rol_epi32(__m256i __A, const int __B) {
+  return (__m256i)__builtin_ia32_prold256_mask(
+      (__v8si)__A, __B, (__v8si)_mm256_setzero_si256(), (__mmask8)-1);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm256_mask_rol_epi32(__m256i __W, __mmask8 __U, __m256i __A, const int __B) {
+  return (__m256i)__builtin_ia32_prold256_mask((__v8si)__A, __B, (__v8si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_rol_epi32(__mmask8 __U, __m256i __A, const int __B) {
+  return (__m256i)__builtin_ia32_prold256_mask(
+      (__v8si)__A, __B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_rol_epi32(__m128i __A, const int __B) {
+  return (__m128i)__builtin_ia32_prold128_mask(
+      (__v4si)__A, __B, (__v4si)_mm_setzero_si128(), (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_rol_epi32(__m128i __W, __mmask8 __U, __m128i __A, const int __B) {
+  return (__m128i)__builtin_ia32_prold128_mask((__v4si)__A, __B, (__v4si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_rol_epi32(__mmask8 __U, __m128i __A, const int __B) {
+  return (__m128i)__builtin_ia32_prold128_mask(
+      (__v4si)__A, __B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_ror_epi32(__m256i __A, const int __B) {
+  return (__m256i)__builtin_ia32_prord256_mask(
+      (__v8si)__A, __B, (__v8si)_mm256_setzero_si256(), (__mmask8)-1);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm256_mask_ror_epi32(__m256i __W, __mmask8 __U, __m256i __A, const int __B) {
+  return (__m256i)__builtin_ia32_prord256_mask((__v8si)__A, __B, (__v8si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_ror_epi32(__mmask8 __U, __m256i __A, const int __B) {
+  return (__m256i)__builtin_ia32_prord256_mask(
+      (__v8si)__A, __B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_ror_epi32(__m128i __A, const int __B) {
+  return (__m128i)__builtin_ia32_prord128_mask(
+      (__v4si)__A, __B, (__v4si)_mm_setzero_si128(), (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_ror_epi32(__m128i __W, __mmask8 __U, __m128i __A, const int __B) {
+  return (__m128i)__builtin_ia32_prord128_mask((__v4si)__A, __B, (__v4si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_ror_epi32(__mmask8 __U, __m128i __A, const int __B) {
+  return (__m128i)__builtin_ia32_prord128_mask(
+      (__v4si)__A, __B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_rol_epi64(__m256i __A, const int __B) {
+  return (__m256i)__builtin_ia32_prolq256_mask(
+      (__v4di)__A, __B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm256_mask_rol_epi64(__m256i __W, __mmask8 __U, __m256i __A, const int __B) {
+  return (__m256i)__builtin_ia32_prolq256_mask((__v4di)__A, __B, (__v4di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_rol_epi64(__mmask8 __U, __m256i __A, const int __B) {
+  return (__m256i)__builtin_ia32_prolq256_mask(
+      (__v4di)__A, __B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_rol_epi64(__m128i __A, const int __B) {
+  return (__m128i)__builtin_ia32_prolq128_mask(
+      (__v2di)__A, __B, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_rol_epi64(__m128i __W, __mmask8 __U, __m128i __A, const int __B) {
+  return (__m128i)__builtin_ia32_prolq128_mask((__v2di)__A, __B, (__v2di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_rol_epi64(__mmask8 __U, __m128i __A, const int __B) {
+  return (__m128i)__builtin_ia32_prolq128_mask(
+      (__v2di)__A, __B, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_ror_epi64(__m256i __A, const int __B) {
+  return (__m256i)__builtin_ia32_prorq256_mask(
+      (__v4di)__A, __B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
+}
+
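/* Illustrative sketch, not from the patch: vprold/vprord are true bit
   rotates, replacing the classic shift/shift/or idiom. E.g. SHA-256's
   sigma0(x) = rotr(x,7) ^ rotr(x,18) ^ (x >> 3) becomes: */
static inline __m128i sha256_sigma0(__m128i x) {
  return _mm_xor_si128(
      _mm_xor_si128(_mm_ror_epi32(x, 7), _mm_ror_epi32(x, 18)),
      _mm_srli_epi32(x, 3));
}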
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm256_mask_ror_epi64(__m256i __W, __mmask8 __U, __m256i __A, const int __B) {
+  return (__m256i)__builtin_ia32_prorq256_mask((__v4di)__A, __B, (__v4di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_ror_epi64(__mmask8 __U, __m256i __A, const int __B) {
+  return (__m256i)__builtin_ia32_prorq256_mask(
+      (__v4di)__A, __B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_ror_epi64(__m128i __A, const int __B) {
+  return (__m128i)__builtin_ia32_prorq128_mask(
+      (__v2di)__A, __B, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_ror_epi64(__m128i __W, __mmask8 __U, __m128i __A, const int __B) {
+  return (__m128i)__builtin_ia32_prorq128_mask((__v2di)__A, __B, (__v2di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_ror_epi64(__mmask8 __U, __m128i __A, const int __B) {
+  return (__m128i)__builtin_ia32_prorq128_mask(
+      (__v2di)__A, __B, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_alignr_epi32(__m128i __A, __m128i __B, const int __imm) {
+  return (__m128i)__builtin_ia32_alignd128_mask((__v4si)__A, (__v4si)__B, __imm,
+      (__v4si)_mm_setzero_si128(),
+      (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_alignr_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B,
+        const int __imm) {
+  return (__m128i)__builtin_ia32_alignd128_mask((__v4si)__A, (__v4si)__B, __imm,
+      (__v4si)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_alignr_epi32(__mmask8 __U, __m128i __A, __m128i __B,
+        const int __imm) {
+  return (__m128i)__builtin_ia32_alignd128_mask((__v4si)__A, (__v4si)__B, __imm,
+      (__v4si)_mm_setzero_si128(),
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_alignr_epi64(__m128i __A, __m128i __B, const int __imm) {
+  return (__m128i)__builtin_ia32_alignq128_mask((__v2di)__A, (__v2di)__B, __imm,
+      (__v2di)_mm_setzero_si128(),
+      (__mmask8)-1);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_alignr_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B,
+        const int __imm) {
+  return (__m128i)__builtin_ia32_alignq128_mask((__v2di)__A, (__v2di)__B, __imm,
+      (__v2di)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_alignr_epi64(__mmask8 __U, __m128i __A, __m128i __B,
+        const int __imm) {
+  return (__m128i)__builtin_ia32_alignq128_mask((__v2di)__A, (__v2di)__B, __imm,
+      (__v2di)_mm_setzero_si128(),
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_alignr_epi32(__m256i __A, __m256i __B, const int __imm) {
+  return (__m256i)__builtin_ia32_alignd256_mask((__v8si)__A, (__v8si)__B, __imm,
+      (__v8si)_mm256_setzero_si256(),
+      (__mmask8)-1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_alignr_epi32(__m256i __W, __mmask8 __U, __m256i __A,
+        __m256i __B, const int __imm) {
+  return (__m256i)__builtin_ia32_alignd256_mask((__v8si)__A, (__v8si)__B, __imm,
+      (__v8si)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_alignr_epi32(__mmask8 __U, __m256i __A, __m256i __B,
+        const int __imm) {
+  return (__m256i)__builtin_ia32_alignd256_mask((__v8si)__A, (__v8si)__B, __imm,
+      (__v8si)_mm256_setzero_si256(),
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_alignr_epi64(__m256i __A, __m256i __B, const int __imm) {
+  return (__m256i)__builtin_ia32_alignq256_mask((__v4di)__A, (__v4di)__B, __imm,
+      (__v4di)_mm256_setzero_si256(),
+      (__mmask8)-1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_alignr_epi64(__m256i __W, __mmask8 __U, __m256i __A,
+        __m256i __B, const int __imm) {
+  return (__m256i)__builtin_ia32_alignq256_mask((__v4di)__A, (__v4di)__B, __imm,
+      (__v4di)__W, (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_alignr_epi64(__mmask8 __U, __m256i __A, __m256i __B,
+        const int __imm) {
+  return (__m256i)__builtin_ia32_alignq256_mask((__v4di)__A, (__v4di)__B, __imm,
+      (__v4di)_mm256_setzero_si256(),
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cvtps_ph(__m128i __W, __mmask8 __U, __m128 __A, const int __I) {
+  return (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)__A, __I, (__v8hi)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_cvtps_ph(__mmask8 __U, __m128 __A, const int __I) {
+  return (__m128i)__builtin_ia32_vcvtps2ph_mask(
+      (__v4sf)__A, __I, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cvtps_ph(__m128i __W, __mmask8 __U, __m256 __A, const int __I) {
+  return (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)__A, __I,
+      (__v8hi)__W, (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_cvtps_ph(__mmask8 __U, __m256 __A, const int __I) {
+  return (__m128i)__builtin_ia32_vcvtps2ph256_mask(
+      (__v8sf)__A, __I, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A,
+        const int __imm) {
+  return (__m256i)__builtin_ia32_psradi256_mask((__v8si)__A, __imm, (__v8si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, const int __imm) {
+  return (__m256i)__builtin_ia32_psradi256_mask(
+      (__v8si)__A, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, const int __imm) {
+  return (__m128i)__builtin_ia32_psradi128_mask((__v4si)__A, __imm, (__v4si)__W,
+      (__mmask8)__U);
+}
+
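/* Illustrative sketch (hypothetical helper): the mask/maskz shift forms
   fuse a blend into the shift. Merge-masking keeps __W's lane where the
   mask bit is clear, so a conditional "x >>= 16" is one instruction: */
static inline __m256i sar16_where(__mmask8 k, __m256i x) {
  return _mm256_mask_srai_epi32(x, k, x, 16); /* lane = k ? x >> 16 : x */
}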
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, const int __imm) {
+  return (__m128i)__builtin_ia32_psradi128_mask(
+      (__v4si)__A, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_srai_epi64(__m256i __A, const int __imm) {
+  return (__m256i)__builtin_ia32_psraqi256_mask(
+      (__v4di)__A, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)-1);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A,
+        const int __imm) {
+  return (__m256i)__builtin_ia32_psraqi256_mask((__v4di)__A, __imm, (__v4di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, const int __imm) {
+  return (__m256i)__builtin_ia32_psraqi256_mask(
+      (__v4di)__A, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_srai_epi64(__m128i __A, const int __imm) {
+  return (__m128i)__builtin_ia32_psraqi128_mask(
+      (__v2di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)-1);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, const int __imm) {
+  return (__m128i)__builtin_ia32_psraqi128_mask((__v2di)__A, __imm, (__v2di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, const int __imm) {
+  return (__m128i)__builtin_ia32_psraqi128_mask(
+      (__v2di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B) {
+  return (__m128i)__builtin_ia32_pslldi128_mask((__v4si)__A, __B, (__v4si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B) {
+  return (__m128i)__builtin_ia32_pslldi128_mask(
+      (__v4si)__A, __B, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B) {
+  return (__m128i)__builtin_ia32_psllqi128_mask((__v2di)__A, __B, (__v2di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, int __B) {
+  return (__m128i)__builtin_ia32_psllqi128_mask(
+      (__v2di)__A, __B, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B) {
+  return (__m256i)__builtin_ia32_pslldi256_mask((__v8si)__A, __B, (__v8si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, int __B) {
+  return (__m256i)__builtin_ia32_pslldi256_mask(
+      (__v8si)__A, __B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B) {
+  return (__m256i)__builtin_ia32_psllqi256_mask((__v4di)__A, __B, (__v4di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, int __B) {
+  return (__m256i)__builtin_ia32_psllqi256_mask(
+      (__v4di)__A, __B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_permutex_pd(__m256d __W, __mmask8 __U, __m256d __X,
+        const int __imm) {
+  return (__m256d)__builtin_ia32_permdf256_mask((__v4df)__X, __imm, (__v4df)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_permutex_pd(__mmask8 __U, __m256d __X, const int __imm) {
+  return (__m256d)__builtin_ia32_permdf256_mask(
+      (__v4df)__X, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__,
+                                       __artificial__))
+_mm256_mask_permute_pd(__m256d __W, __mmask8 __U, __m256d __X, const int __C) {
+  return (__m256d)__builtin_ia32_vpermilpd256_mask((__v4df)__X, __C,
+      (__v4df)__W, (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_permute_pd(__mmask8 __U, __m256d __X, const int __C) {
+  return (__m256d)__builtin_ia32_vpermilpd256_mask(
+      (__v4df)__X, __C, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_permute_pd(__m128d __W, __mmask8 __U, __m128d __X, const int __C) {
+  return (__m128d)__builtin_ia32_vpermilpd_mask((__v2df)__X, __C, (__v2df)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_permute_pd(__mmask8 __U, __m128d __X, const int __C) {
+  return (__m128d)__builtin_ia32_vpermilpd_mask(
+      (__v2df)__X, __C, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__,
+                                      __artificial__))
+_mm256_mask_permute_ps(__m256 __W, __mmask8 __U, __m256 __X, const int __C) {
+  return (__m256)__builtin_ia32_vpermilps256_mask((__v8sf)__X, __C, (__v8sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_maskz_permute_ps(__mmask8 __U, __m256 __X, const int __C) {
+  return (__m256)__builtin_ia32_vpermilps256_mask(
+      (__v8sf)__X, __C, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_permute_ps(__m128 __W, __mmask8 __U, __m128 __X, const int __C) {
+  return (__m128)__builtin_ia32_vpermilps_mask((__v4sf)__X, __C, (__v4sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_maskz_permute_ps(__mmask8 __U, __m128 __X, const int __C) {
+  return (__m128)__builtin_ia32_vpermilps_mask(
+      (__v4sf)__X, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_blend_pd(__mmask8 __U, __m256d __A, __m256d __W) {
+  return (__m256d)__builtin_ia32_blendmpd_256_mask((__v4df)__A, (__v4df)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_blend_ps(__mmask8 __U, __m256 __A, __m256 __W) {
+  return (__m256)__builtin_ia32_blendmps_256_mask((__v8sf)__A, (__v8sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_blend_epi64(__mmask8 __U, __m256i __A, __m256i __W) {
+  return (__m256i)__builtin_ia32_blendmq_256_mask((__v4di)__A, (__v4di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_blend_epi32(__mmask8 __U, __m256i __A, __m256i __W) {
+  return (__m256i)__builtin_ia32_blendmd_256_mask((__v8si)__A, (__v8si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_blend_pd(__mmask8 __U, __m128d __A, __m128d __W) {
+  return (__m128d)__builtin_ia32_blendmpd_128_mask((__v2df)__A, (__v2df)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_blend_ps(__mmask8 __U, __m128 __A, __m128 __W) {
+  return (__m128)__builtin_ia32_blendmps_128_mask((__v4sf)__A, (__v4sf)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_blend_epi64(__mmask8 __U, __m128i __A, __m128i __W) {
+  return (__m128i)__builtin_ia32_blendmq_128_mask((__v2di)__A, (__v2di)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_blend_epi32(__mmask8 __U, __m128i __A, __m128i __W) {
+  return (__m128i)__builtin_ia32_blendmd_128_mask((__v4si)__A, (__v4si)__W,
+      (__mmask8)__U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmp_epi64_mask(__m256i __X, __m256i __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, __P,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmp_epi32_mask(__m256i __X, __m256i __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, __P,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmp_epu64_mask(__m256i __X, __m256i __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, __P,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmp_epu32_mask(__m256i __X, __m256i __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, __P,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmp_pd_mask(__m256d __X, __m256d __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)__X, (__v4df)__Y, __P,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cmp_ps_mask(__m256 __X, __m256 __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)__X, (__v8sf)__Y, __P,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmp_epi64_mask(__mmask8 __U, __m256i __X, __m256i __Y,
+        const int __P) {
+  return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, __P,
+      (__mmask8)__U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmp_epi32_mask(__mmask8 __U, __m256i __X, __m256i __Y,
+        const int __P) {
+  return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, __P,
+      (__mmask8)__U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmp_epu64_mask(__mmask8 __U, __m256i __X, __m256i __Y,
+        const int __P) {
+  return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, __P,
+      (__mmask8)__U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_mask_cmp_epu32_mask(__mmask8 __U, __m256i __X, __m256i __Y,
+        const int __P) {
+  return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, __P,
+      (__mmask8)__U);
+}
+
+extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__,
+                                        __artificial__))
+_mm256_mask_cmp_pd_mask(__mmask8 __U, __m256d __X, __m256d __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)__X, (__v4df)__Y, __P,
+      (__mmask8)__U);
+}
+
+extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__,
+                                        __artificial__))
+_mm256_mask_cmp_ps_mask(__mmask8 __U, __m256 __X, __m256 __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)__X, (__v8sf)__Y, __P,
+      (__mmask8)__U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmp_epi64_mask(__m128i __X, __m128i __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, __P,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmp_epi32_mask(__m128i __X, __m128i __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, __P,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmp_epu64_mask(__m128i __X, __m128i __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, __P,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmp_epu32_mask(__m128i __X, __m128i __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, __P,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmp_pd_mask(__m128d __X, __m128d __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)__X, (__v2df)__Y, __P,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmp_ps_mask(__m128 __X, __m128 __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)__X, (__v4sf)__Y, __P,
+      (__mmask8)-1);
+}
+
+extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__,
+                                        __artificial__))
+_mm_mask_cmp_epi64_mask(__mmask8 __U, __m128i __X, __m128i __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, __P,
+      (__mmask8)__U);
+}
+
+extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__,
+                                        __artificial__))
+_mm_mask_cmp_epi32_mask(__mmask8 __U, __m128i __X, __m128i __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, __P,
+      (__mmask8)__U);
+}
+
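/* Illustrative sketch, not part of this patch: unlike the SSE/AVX compare
   intrinsics these return an __mmask8 bit-mask rather than a vector, which
   feeds directly into the blend/mask forms defined above. _CMP_LT_OQ is
   one of the AVX predicate constants. */
static inline __m256 zero_out_negatives(__m256 x) {
  __mmask8 neg = _mm256_cmp_ps_mask(x, _mm256_setzero_ps(), _CMP_LT_OQ);
  return _mm256_mask_blend_ps(neg, x, _mm256_setzero_ps());
}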
+extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__,
+                                        __artificial__))
+_mm_mask_cmp_epu64_mask(__mmask8 __U, __m128i __X, __m128i __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, __P,
+      (__mmask8)__U);
+}
+
+extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__,
+                                        __artificial__))
+_mm_mask_cmp_epu32_mask(__mmask8 __U, __m128i __X, __m128i __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, __P,
+      (__mmask8)__U);
+}
+
+extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__,
+                                        __artificial__))
+_mm_mask_cmp_pd_mask(__mmask8 __U, __m128d __X, __m128d __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)__X, (__v2df)__Y, __P,
+      (__mmask8)__U);
+}
+
+extern __inline __mmask8
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mask_cmp_ps_mask(__mmask8 __U, __m128 __X, __m128 __Y, const int __P) {
+  return (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)__X, (__v4sf)__Y, __P,
+      (__mmask8)__U);
+}
+
+extern __inline __m256d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_permutex_pd(__m256d __X, const int __M) {
+  return (__m256d)__builtin_ia32_permdf256_mask(
+      (__v4df)__X, __M, (__v4df)_mm256_undefined_pd(), (__mmask8)-1);
+}
+
+#else
+#define _mm256_permutex_pd(X, M) \
+  ((__m256d)__builtin_ia32_permdf256_mask( \
+      (__v4df)(__m256d)(X), (int)(M), (__v4df)(__m256d)_mm256_undefined_pd(), \
+      (__mmask8)-1))
+
+#define _mm256_permutex_epi64(X, I) \
+  ((__m256i)__builtin_ia32_permdi256_mask( \
+      (__v4di)(__m256i)(X), (int)(I), \
+      (__v4di)(__m256i)(_mm256_setzero_si256()), (__mmask8)-1))
+
+#define _mm256_maskz_permutex_epi64(M, X, I) \
+  ((__m256i)__builtin_ia32_permdi256_mask( \
+      (__v4di)(__m256i)(X), (int)(I), \
+      (__v4di)(__m256i)(_mm256_setzero_si256()), (__mmask8)(M)))
+
+#define _mm256_mask_permutex_epi64(W, M, X, I) \
+  ((__m256i)__builtin_ia32_permdi256_mask( \
+      (__v4di)(__m256i)(X), (int)(I), (__v4di)(__m256i)(W), (__mmask8)(M)))
+
+#define _mm256_insertf32x4(X, Y, C) \
+  ((__m256)__builtin_ia32_insertf32x4_256_mask( \
+      (__v8sf)(__m256)(X), (__v4sf)(__m128)(Y), (int)(C), \
+      (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)-1))
+
+#define _mm256_mask_insertf32x4(W, U, X, Y, C) \
+  ((__m256)__builtin_ia32_insertf32x4_256_mask( \
+      (__v8sf)(__m256)(X), (__v4sf)(__m128)(Y), (int)(C), (__v8sf)(__m256)(W), \
+      (__mmask8)(U)))
+
+#define _mm256_maskz_insertf32x4(U, X, Y, C) \
+  ((__m256)__builtin_ia32_insertf32x4_256_mask( \
+      (__v8sf)(__m256)(X), (__v4sf)(__m128)(Y), (int)(C), \
+      (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)(U)))
+
+#define _mm256_inserti32x4(X, Y, C) \
+  ((__m256i)__builtin_ia32_inserti32x4_256_mask( \
+      (__v8si)(__m256i)(X), (__v4si)(__m128i)(Y), (int)(C), \
+      (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)-1))
+
+#define _mm256_mask_inserti32x4(W, U, X, Y, C) \
+  ((__m256i)__builtin_ia32_inserti32x4_256_mask( \
+      (__v8si)(__m256i)(X), (__v4si)(__m128i)(Y), (int)(C), \
+      (__v8si)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_inserti32x4(U, X, Y, C) \
+  ((__m256i)__builtin_ia32_inserti32x4_256_mask( \
+      (__v8si)(__m256i)(X), (__v4si)(__m128i)(Y), (int)(C), \
+      (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)(U)))
+
+#define _mm256_extractf32x4_ps(X, C) \
+  ((__m128)__builtin_ia32_extractf32x4_256_mask( \
+      (__v8sf)(__m256)(X), (int)(C), (__v4sf)(__m128)_mm_setzero_ps(), \
+      (__mmask8)-1))
+
+#define _mm256_mask_extractf32x4_ps(W, U, X, C) \
((__m128)__builtin_ia32_extractf32x4_256_mask( \ + (__v8sf)(__m256)(X), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U))) + +#define _mm256_maskz_extractf32x4_ps(U, X, C) \ + ((__m128)__builtin_ia32_extractf32x4_256_mask( \ + (__v8sf)(__m256)(X), (int)(C), (__v4sf)(__m128)_mm_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm256_extracti32x4_epi32(X, C) \ + ((__m128i)__builtin_ia32_extracti32x4_256_mask( \ + (__v8si)(__m256i)(X), (int)(C), (__v4si)(__m128i)_mm_setzero_si128(), \ + (__mmask8)-1)) + +#define _mm256_mask_extracti32x4_epi32(W, U, X, C) \ + ((__m128i)__builtin_ia32_extracti32x4_256_mask( \ + (__v8si)(__m256i)(X), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U))) + +#define _mm256_maskz_extracti32x4_epi32(U, X, C) \ + ((__m128i)__builtin_ia32_extracti32x4_256_mask( \ + (__v8si)(__m256i)(X), (int)(C), (__v4si)(__m128i)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_shuffle_i64x2(X, Y, C) \ + ((__m256i)__builtin_ia32_shuf_i64x2_256_mask( \ + (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), \ + (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask8)-1)) + +#define _mm256_mask_shuffle_i64x2(W, U, X, Y, C) \ + ((__m256i)__builtin_ia32_shuf_i64x2_256_mask( \ + (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), \ + (__v4di)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_shuffle_i64x2(U, X, Y, C) \ + ((__m256i)__builtin_ia32_shuf_i64x2_256_mask( \ + (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), \ + (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask8)(U))) + +#define _mm256_shuffle_i32x4(X, Y, C) \ + ((__m256i)__builtin_ia32_shuf_i32x4_256_mask( \ + (__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), \ + (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)-1)) + +#define _mm256_mask_shuffle_i32x4(W, U, X, Y, C) \ + ((__m256i)__builtin_ia32_shuf_i32x4_256_mask( \ + (__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), \ + (__v8si)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_shuffle_i32x4(U, X, Y, C) \ + ((__m256i)__builtin_ia32_shuf_i32x4_256_mask( \ + (__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), \ + (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)(U))) + +#define _mm256_shuffle_f64x2(X, Y, C) \ + ((__m256d)__builtin_ia32_shuf_f64x2_256_mask( \ + (__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(C), \ + (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)-1)) + +#define _mm256_mask_shuffle_f64x2(W, U, X, Y, C) \ + ((__m256d)__builtin_ia32_shuf_f64x2_256_mask( \ + (__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(C), \ + (__v4df)(__m256d)(W), (__mmask8)(U))) + +#define _mm256_maskz_shuffle_f64x2(U, X, Y, C) \ + ((__m256d)__builtin_ia32_shuf_f64x2_256_mask( \ + (__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(C), \ + (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)(U))) + +#define _mm256_shuffle_f32x4(X, Y, C) \ + ((__m256)__builtin_ia32_shuf_f32x4_256_mask( \ + (__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(C), \ + (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)-1)) + +#define _mm256_mask_shuffle_f32x4(W, U, X, Y, C) \ + ((__m256)__builtin_ia32_shuf_f32x4_256_mask( \ + (__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(C), (__v8sf)(__m256)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_shuffle_f32x4(U, X, Y, C) \ + ((__m256)__builtin_ia32_shuf_f32x4_256_mask( \ + (__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(C), \ + (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)(U))) + +#define _mm256_mask_shuffle_pd(W, U, A, B, C) \ + ((__m256d)__builtin_ia32_shufpd256_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)(__m256d)(W), 
(__mmask8)(U))) + +#define _mm256_maskz_shuffle_pd(U, A, B, C) \ + ((__m256d)__builtin_ia32_shufpd256_mask( \ + (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)(U))) + +#define _mm_mask_shuffle_pd(W, U, A, B, C) \ + ((__m128d)__builtin_ia32_shufpd128_mask( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)(__m128d)(W), (__mmask8)(U))) + +#define _mm_maskz_shuffle_pd(U, A, B, C) \ + ((__m128d)__builtin_ia32_shufpd128_mask( \ + (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)(__m128d)_mm_setzero_pd(), (__mmask8)(U))) + +#define _mm256_mask_shuffle_ps(W, U, A, B, C) \ + ((__m256)__builtin_ia32_shufps256_mask((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)(__m256)(W), (__mmask8)(U))) + +#define _mm256_maskz_shuffle_ps(U, A, B, C) \ + ((__m256)__builtin_ia32_shufps256_mask( \ + (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)(U))) + +#define _mm_mask_shuffle_ps(W, U, A, B, C) \ + ((__m128)__builtin_ia32_shufps128_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)(__m128)(W), (__mmask8)(U))) + +#define _mm_maskz_shuffle_ps(U, A, B, C) \ + ((__m128)__builtin_ia32_shufps128_mask( \ + (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)(U))) + +#define _mm256_fixupimm_pd(X, Y, Z, C) \ + ((__m256d)__builtin_ia32_fixupimmpd256_mask( \ + (__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (__v4di)(__m256i)(Z), \ + (int)(C), (__mmask8)(-1))) + +#define _mm256_mask_fixupimm_pd(X, U, Y, Z, C) \ + ((__m256d)__builtin_ia32_fixupimmpd256_mask( \ + (__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (__v4di)(__m256i)(Z), \ + (int)(C), (__mmask8)(U))) + +#define _mm256_maskz_fixupimm_pd(U, X, Y, Z, C) \ + ((__m256d)__builtin_ia32_fixupimmpd256_maskz( \ + (__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (__v4di)(__m256i)(Z), \ + (int)(C), (__mmask8)(U))) + +#define _mm256_fixupimm_ps(X, Y, Z, C) \ + ((__m256)__builtin_ia32_fixupimmps256_mask( \ + (__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (__v8si)(__m256i)(Z), \ + (int)(C), (__mmask8)(-1))) + +#define _mm256_mask_fixupimm_ps(X, U, Y, Z, C) \ + ((__m256)__builtin_ia32_fixupimmps256_mask( \ + (__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (__v8si)(__m256i)(Z), \ + (int)(C), (__mmask8)(U))) + +#define _mm256_maskz_fixupimm_ps(U, X, Y, Z, C) \ + ((__m256)__builtin_ia32_fixupimmps256_maskz( \ + (__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (__v8si)(__m256i)(Z), \ + (int)(C), (__mmask8)(U))) + +#define _mm_fixupimm_pd(X, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmpd128_mask( \ + (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), \ + (int)(C), (__mmask8)(-1))) + +#define _mm_mask_fixupimm_pd(X, U, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmpd128_mask( \ + (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), \ + (int)(C), (__mmask8)(U))) + +#define _mm_maskz_fixupimm_pd(U, X, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmpd128_maskz( \ + (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), \ + (int)(C), (__mmask8)(U))) + +#define _mm_fixupimm_ps(X, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmps128_mask( \ + (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), \ + (int)(C), (__mmask8)(-1))) + +#define _mm_mask_fixupimm_ps(X, U, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmps128_mask( \ + (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), \ + (int)(C), (__mmask8)(U))) + +#define 
_mm_maskz_fixupimm_ps(U, X, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmps128_maskz( \ + (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), \ + (int)(C), (__mmask8)(U))) + +#define _mm256_mask_srli_epi32(W, U, A, B) \ + ((__m256i)__builtin_ia32_psrldi256_mask( \ + (__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_srli_epi32(U, A, B) \ + ((__m256i)__builtin_ia32_psrldi256_mask((__v8si)(__m256i)(A), (int)(B), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U))) + +#define _mm_mask_srli_epi32(W, U, A, B) \ + ((__m128i)__builtin_ia32_psrldi128_mask( \ + (__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_srli_epi32(U, A, B) \ + ((__m128i)__builtin_ia32_psrldi128_mask((__v4si)(__m128i)(A), (int)(B), \ + (__v4si)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_mask_srli_epi64(W, U, A, B) \ + ((__m256i)__builtin_ia32_psrlqi256_mask( \ + (__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_srli_epi64(U, A, B) \ + ((__m256i)__builtin_ia32_psrlqi256_mask((__v4di)(__m256i)(A), (int)(B), \ + (__v4di)_mm256_setzero_si256(), \ + (__mmask8)(U))) + +#define _mm_mask_srli_epi64(W, U, A, B) \ + ((__m128i)__builtin_ia32_psrlqi128_mask( \ + (__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_srli_epi64(U, A, B) \ + ((__m128i)__builtin_ia32_psrlqi128_mask((__v2di)(__m128i)(A), (int)(B), \ + (__v2di)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_mask_slli_epi32(W, U, X, C) \ + ((__m256i)__builtin_ia32_pslldi256_mask( \ + (__v8si)(__m256i)(X), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_slli_epi32(U, X, C) \ + ((__m256i)__builtin_ia32_pslldi256_mask( \ + (__v8si)(__m256i)(X), (int)(C), (__v8si)(__m256i)_mm256_setzero_si256(), \ + (__mmask8)(U))) + +#define _mm256_mask_slli_epi64(W, U, X, C) \ + ((__m256i)__builtin_ia32_psllqi256_mask( \ + (__v4di)(__m256i)(X), (int)(C), (__v4di)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_slli_epi64(U, X, C) \ + ((__m256i)__builtin_ia32_psllqi256_mask( \ + (__v4di)(__m256i)(X), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256(), \ + (__mmask8)(U))) + +#define _mm_mask_slli_epi32(W, U, X, C) \ + ((__m128i)__builtin_ia32_pslldi128_mask( \ + (__v4si)(__m128i)(X), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_slli_epi32(U, X, C) \ + ((__m128i)__builtin_ia32_pslldi128_mask( \ + (__v4si)(__m128i)(X), (int)(C), (__v4si)(__m128i)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm_mask_slli_epi64(W, U, X, C) \ + ((__m128i)__builtin_ia32_psllqi128_mask( \ + (__v2di)(__m128i)(X), (int)(C), (__v2di)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_slli_epi64(U, X, C) \ + ((__m128i)__builtin_ia32_psllqi128_mask( \ + (__v2di)(__m128i)(X), (int)(C), (__v2di)(__m128i)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_ternarylogic_epi64(A, B, C, I) \ + ((__m256i)__builtin_ia32_pternlogq256_mask( \ + (__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), \ + (int)(I), (__mmask8)-1)) + +#define _mm256_mask_ternarylogic_epi64(A, U, B, C, I) \ + ((__m256i)__builtin_ia32_pternlogq256_mask( \ + (__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), \ + (int)(I), (__mmask8)(U))) + +#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, I) \ + ((__m256i)__builtin_ia32_pternlogq256_maskz( \ + (__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), \ + (int)(I), (__mmask8)(U))) + +#define 
_mm256_ternarylogic_epi32(A, B, C, I) \ + ((__m256i)__builtin_ia32_pternlogd256_mask( \ + (__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), \ + (int)(I), (__mmask8)-1)) + +#define _mm256_mask_ternarylogic_epi32(A, U, B, C, I) \ + ((__m256i)__builtin_ia32_pternlogd256_mask( \ + (__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), \ + (int)(I), (__mmask8)(U))) + +#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, I) \ + ((__m256i)__builtin_ia32_pternlogd256_maskz( \ + (__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), \ + (int)(I), (__mmask8)(U))) + +#define _mm_ternarylogic_epi64(A, B, C, I) \ + ((__m128i)__builtin_ia32_pternlogq128_mask( \ + (__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), \ + (int)(I), (__mmask8)-1)) + +#define _mm_mask_ternarylogic_epi64(A, U, B, C, I) \ + ((__m128i)__builtin_ia32_pternlogq128_mask( \ + (__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), \ + (int)(I), (__mmask8)(U))) + +#define _mm_maskz_ternarylogic_epi64(U, A, B, C, I) \ + ((__m128i)__builtin_ia32_pternlogq128_maskz( \ + (__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), \ + (int)(I), (__mmask8)(U))) + +#define _mm_ternarylogic_epi32(A, B, C, I) \ + ((__m128i)__builtin_ia32_pternlogd128_mask( \ + (__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), \ + (int)(I), (__mmask8)-1)) + +#define _mm_mask_ternarylogic_epi32(A, U, B, C, I) \ + ((__m128i)__builtin_ia32_pternlogd128_mask( \ + (__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), \ + (int)(I), (__mmask8)(U))) + +#define _mm_maskz_ternarylogic_epi32(U, A, B, C, I) \ + ((__m128i)__builtin_ia32_pternlogd128_maskz( \ + (__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), \ + (int)(I), (__mmask8)(U))) + +#define _mm256_roundscale_ps(A, B) \ + ((__m256)__builtin_ia32_rndscaleps_256_mask( \ + (__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)_mm256_setzero_ps(), \ + (__mmask8)-1)) + +#define _mm256_mask_roundscale_ps(W, U, A, B) \ + ((__m256)__builtin_ia32_rndscaleps_256_mask( \ + (__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U))) + +#define _mm256_maskz_roundscale_ps(U, A, B) \ + ((__m256)__builtin_ia32_rndscaleps_256_mask( \ + (__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)_mm256_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm256_roundscale_pd(A, B) \ + ((__m256d)__builtin_ia32_rndscalepd_256_mask( \ + (__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)_mm256_setzero_pd(), \ + (__mmask8)-1)) + +#define _mm256_mask_roundscale_pd(W, U, A, B) \ + ((__m256d)__builtin_ia32_rndscalepd_256_mask( \ + (__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U))) + +#define _mm256_maskz_roundscale_pd(U, A, B) \ + ((__m256d)__builtin_ia32_rndscalepd_256_mask( \ + (__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)_mm256_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm_roundscale_ps(A, B) \ + ((__m128)__builtin_ia32_rndscaleps_128_mask( \ + (__v4sf)(__m128)(A), (int)(B), (__v4sf)(__m128)_mm_setzero_ps(), \ + (__mmask8)-1)) + +#define _mm_mask_roundscale_ps(W, U, A, B) \ + ((__m128)__builtin_ia32_rndscaleps_128_mask( \ + (__v4sf)(__m128)(A), (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U))) + +#define _mm_maskz_roundscale_ps(U, A, B) \ + ((__m128)__builtin_ia32_rndscaleps_128_mask( \ + (__v4sf)(__m128)(A), (int)(B), (__v4sf)(__m128)_mm_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm_roundscale_pd(A, B) \ + ((__m128d)__builtin_ia32_rndscalepd_128_mask( \ + (__v2df)(__m128d)(A), (int)(B), (__v2df)(__m128d)_mm_setzero_pd(), \ 
+ (__mmask8)-1)) + +#define _mm_mask_roundscale_pd(W, U, A, B) \ + ((__m128d)__builtin_ia32_rndscalepd_128_mask( \ + (__v2df)(__m128d)(A), (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U))) + +#define _mm_maskz_roundscale_pd(U, A, B) \ + ((__m128d)__builtin_ia32_rndscalepd_128_mask( \ + (__v2df)(__m128d)(A), (int)(B), (__v2df)(__m128d)_mm_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm256_getmant_ps(X, B, C) \ + ((__m256)__builtin_ia32_getmantps256_mask( \ + (__v8sf)(__m256)(X), (int)(((C) << 2) | (B)), \ + (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)-1)) + +#define _mm256_mask_getmant_ps(W, U, X, B, C) \ + ((__m256)__builtin_ia32_getmantps256_mask( \ + (__v8sf)(__m256)(X), (int)(((C) << 2) | (B)), (__v8sf)(__m256)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_getmant_ps(U, X, B, C) \ + ((__m256)__builtin_ia32_getmantps256_mask( \ + (__v8sf)(__m256)(X), (int)(((C) << 2) | (B)), \ + (__v8sf)(__m256)_mm256_setzero_ps(), (__mmask8)(U))) + +#define _mm_getmant_ps(X, B, C) \ + ((__m128)__builtin_ia32_getmantps128_mask( \ + (__v4sf)(__m128)(X), (int)(((C) << 2) | (B)), \ + (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)-1)) + +#define _mm_mask_getmant_ps(W, U, X, B, C) \ + ((__m128)__builtin_ia32_getmantps128_mask( \ + (__v4sf)(__m128)(X), (int)(((C) << 2) | (B)), (__v4sf)(__m128)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_getmant_ps(U, X, B, C) \ + ((__m128)__builtin_ia32_getmantps128_mask( \ + (__v4sf)(__m128)(X), (int)(((C) << 2) | (B)), \ + (__v4sf)(__m128)_mm_setzero_ps(), (__mmask8)(U))) + +#define _mm256_getmant_pd(X, B, C) \ + ((__m256d)__builtin_ia32_getmantpd256_mask( \ + (__v4df)(__m256d)(X), (int)(((C) << 2) | (B)), \ + (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)-1)) + +#define _mm256_mask_getmant_pd(W, U, X, B, C) \ + ((__m256d)__builtin_ia32_getmantpd256_mask( \ + (__v4df)(__m256d)(X), (int)(((C) << 2) | (B)), (__v4df)(__m256d)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_getmant_pd(U, X, B, C) \ + ((__m256d)__builtin_ia32_getmantpd256_mask( \ + (__v4df)(__m256d)(X), (int)(((C) << 2) | (B)), \ + (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)(U))) + +#define _mm_getmant_pd(X, B, C) \ + ((__m128d)__builtin_ia32_getmantpd128_mask( \ + (__v2df)(__m128d)(X), (int)(((C) << 2) | (B)), \ + (__v2df)(__m128d)_mm_setzero_pd(), (__mmask8)-1)) + +#define _mm_mask_getmant_pd(W, U, X, B, C) \ + ((__m128d)__builtin_ia32_getmantpd128_mask( \ + (__v2df)(__m128d)(X), (int)(((C) << 2) | (B)), (__v2df)(__m128d)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_getmant_pd(U, X, B, C) \ + ((__m128d)__builtin_ia32_getmantpd128_mask( \ + (__v2df)(__m128d)(X), (int)(((C) << 2) | (B)), \ + (__v2df)(__m128d)_mm_setzero_pd(), (__mmask8)(U))) + +#define _mm256_mmask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256) __builtin_ia32_gather3siv8sf( \ + (__v8sf)(__m256)V1OLD, (void const *)ADDR, (__v8si)(__m256i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm_mmask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128) __builtin_ia32_gather3siv4sf( \ + (__v4sf)(__m128)V1OLD, (void const *)ADDR, (__v4si)(__m128i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm256_mmask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256d) __builtin_ia32_gather3siv4df( \ + (__v4df)(__m256d)V1OLD, (void const *)ADDR, (__v4si)(__m128i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm_mmask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128d) __builtin_ia32_gather3siv2df( \ + (__v2df)(__m128d)V1OLD, (void const *)ADDR, (__v4si)(__m128i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define 
_mm256_mmask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128) __builtin_ia32_gather3div8sf( \ + (__v4sf)(__m128)V1OLD, (void const *)ADDR, (__v4di)(__m256i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm_mmask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128) __builtin_ia32_gather3div4sf( \ + (__v4sf)(__m128)V1OLD, (void const *)ADDR, (__v2di)(__m128i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm256_mmask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256d) __builtin_ia32_gather3div4df( \ + (__v4df)(__m256d)V1OLD, (void const *)ADDR, (__v4di)(__m256i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm_mmask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128d) __builtin_ia32_gather3div2df( \ + (__v2df)(__m128d)V1OLD, (void const *)ADDR, (__v2di)(__m128i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm256_mmask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256i) __builtin_ia32_gather3siv8si( \ + (__v8si)(__m256i)V1OLD, (void const *)ADDR, (__v8si)(__m256i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm_mmask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128i) __builtin_ia32_gather3siv4si( \ + (__v4si)(__m128i)V1OLD, (void const *)ADDR, (__v4si)(__m128i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm256_mmask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256i) __builtin_ia32_gather3siv4di( \ + (__v4di)(__m256i)V1OLD, (void const *)ADDR, (__v4si)(__m128i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm_mmask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128i) __builtin_ia32_gather3siv2di( \ + (__v2di)(__m128i)V1OLD, (void const *)ADDR, (__v4si)(__m128i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm256_mmask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128i) __builtin_ia32_gather3div8si( \ + (__v4si)(__m128i)V1OLD, (void const *)ADDR, (__v4di)(__m256i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm_mmask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128i) __builtin_ia32_gather3div4si( \ + (__v4si)(__m128i)V1OLD, (void const *)ADDR, (__v2di)(__m128i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm256_mmask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256i) __builtin_ia32_gather3div4di( \ + (__v4di)(__m256i)V1OLD, (void const *)ADDR, (__v4di)(__m256i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm_mmask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128i) __builtin_ia32_gather3div2di( \ + (__v2di)(__m128i)V1OLD, (void const *)ADDR, (__v2di)(__m128i)INDEX, \ + (__mmask8)MASK, (int)SCALE) + +#define _mm256_i32scatter_ps(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8sf((void *)ADDR, (__mmask8)0xFF, \ + (__v8si)(__m256i)INDEX, (__v8sf)(__m256)V1, \ + (int)SCALE) + +#define _mm256_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8sf((void *)ADDR, (__mmask8)MASK, \ + (__v8si)(__m256i)INDEX, (__v8sf)(__m256)V1, \ + (int)SCALE) + +#define _mm_i32scatter_ps(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4sf((void *)ADDR, (__mmask8)0xFF, \ + (__v4si)(__m128i)INDEX, (__v4sf)(__m128)V1, \ + (int)SCALE) + +#define _mm_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4sf((void *)ADDR, (__mmask8)MASK, \ + (__v4si)(__m128i)INDEX, (__v4sf)(__m128)V1, \ + (int)SCALE) + +#define _mm256_i32scatter_pd(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4df((void *)ADDR, (__mmask8)0xFF, \ + (__v4si)(__m128i)INDEX, (__v4df)(__m256d)V1, \ + (int)SCALE) + +#define 
_mm256_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4df((void *)ADDR, (__mmask8)MASK, \ + (__v4si)(__m128i)INDEX, (__v4df)(__m256d)V1, \ + (int)SCALE) + +#define _mm_i32scatter_pd(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv2df((void *)ADDR, (__mmask8)0xFF, \ + (__v4si)(__m128i)INDEX, (__v2df)(__m128d)V1, \ + (int)SCALE) + +#define _mm_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv2df((void *)ADDR, (__mmask8)MASK, \ + (__v4si)(__m128i)INDEX, (__v2df)(__m128d)V1, \ + (int)SCALE) + +#define _mm256_i64scatter_ps(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8sf((void *)ADDR, (__mmask8)0xFF, \ + (__v4di)(__m256i)INDEX, (__v4sf)(__m128)V1, \ + (int)SCALE) + +#define _mm256_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8sf((void *)ADDR, (__mmask8)MASK, \ + (__v4di)(__m256i)INDEX, (__v4sf)(__m128)V1, \ + (int)SCALE) + +#define _mm_i64scatter_ps(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4sf((void *)ADDR, (__mmask8)0xFF, \ + (__v2di)(__m128i)INDEX, (__v4sf)(__m128)V1, \ + (int)SCALE) + +#define _mm_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4sf((void *)ADDR, (__mmask8)MASK, \ + (__v2di)(__m128i)INDEX, (__v4sf)(__m128)V1, \ + (int)SCALE) + +#define _mm256_i64scatter_pd(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4df((void *)ADDR, (__mmask8)0xFF, \ + (__v4di)(__m256i)INDEX, (__v4df)(__m256d)V1, \ + (int)SCALE) + +#define _mm256_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4df((void *)ADDR, (__mmask8)MASK, \ + (__v4di)(__m256i)INDEX, (__v4df)(__m256d)V1, \ + (int)SCALE) + +#define _mm_i64scatter_pd(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv2df((void *)ADDR, (__mmask8)0xFF, \ + (__v2di)(__m128i)INDEX, (__v2df)(__m128d)V1, \ + (int)SCALE) + +#define _mm_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv2df((void *)ADDR, (__mmask8)MASK, \ + (__v2di)(__m128i)INDEX, (__v2df)(__m128d)V1, \ + (int)SCALE) + +#define _mm256_i32scatter_epi32(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8si((void *)ADDR, (__mmask8)0xFF, \ + (__v8si)(__m256i)INDEX, (__v8si)(__m256i)V1, \ + (int)SCALE) + +#define _mm256_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8si((void *)ADDR, (__mmask8)MASK, \ + (__v8si)(__m256i)INDEX, (__v8si)(__m256i)V1, \ + (int)SCALE) + +#define _mm_i32scatter_epi32(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4si((void *)ADDR, (__mmask8)0xFF, \ + (__v4si)(__m128i)INDEX, (__v4si)(__m128i)V1, \ + (int)SCALE) + +#define _mm_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4si((void *)ADDR, (__mmask8)MASK, \ + (__v4si)(__m128i)INDEX, (__v4si)(__m128i)V1, \ + (int)SCALE) + +#define _mm256_i32scatter_epi64(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4di((void *)ADDR, (__mmask8)0xFF, \ + (__v4si)(__m128i)INDEX, (__v4di)(__m256i)V1, \ + (int)SCALE) + +#define _mm256_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4di((void *)ADDR, (__mmask8)MASK, \ + (__v4si)(__m128i)INDEX, (__v4di)(__m256i)V1, \ + (int)SCALE) + +#define _mm_i32scatter_epi64(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv2di((void *)ADDR, (__mmask8)0xFF, \ + (__v4si)(__m128i)INDEX, (__v2di)(__m128i)V1, \ + (int)SCALE) + +#define _mm_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv2di((void *)ADDR, (__mmask8)MASK, \ + (__v4si)(__m128i)INDEX, 
(__v2di)(__m128i)V1, \ + (int)SCALE) + +#define _mm256_i64scatter_epi32(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8si((void *)ADDR, (__mmask8)0xFF, \ + (__v4di)(__m256i)INDEX, (__v4si)(__m128i)V1, \ + (int)SCALE) + +#define _mm256_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8si((void *)ADDR, (__mmask8)MASK, \ + (__v4di)(__m256i)INDEX, (__v4si)(__m128i)V1, \ + (int)SCALE) + +#define _mm_i64scatter_epi32(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4si((void *)ADDR, (__mmask8)0xFF, \ + (__v2di)(__m128i)INDEX, (__v4si)(__m128i)V1, \ + (int)SCALE) + +#define _mm_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4si((void *)ADDR, (__mmask8)MASK, \ + (__v2di)(__m128i)INDEX, (__v4si)(__m128i)V1, \ + (int)SCALE) + +#define _mm256_i64scatter_epi64(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4di((void *)ADDR, (__mmask8)0xFF, \ + (__v4di)(__m256i)INDEX, (__v4di)(__m256i)V1, \ + (int)SCALE) + +#define _mm256_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4di((void *)ADDR, (__mmask8)MASK, \ + (__v4di)(__m256i)INDEX, (__v4di)(__m256i)V1, \ + (int)SCALE) + +#define _mm_i64scatter_epi64(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv2di((void *)ADDR, (__mmask8)0xFF, \ + (__v2di)(__m128i)INDEX, (__v2di)(__m128i)V1, \ + (int)SCALE) + +#define _mm_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv2di((void *)ADDR, (__mmask8)MASK, \ + (__v2di)(__m128i)INDEX, (__v2di)(__m128i)V1, \ + (int)SCALE) + +#define _mm256_mask_shuffle_epi32(W, U, X, C) \ + ((__m256i)__builtin_ia32_pshufd256_mask( \ + (__v8si)(__m256i)(X), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_shuffle_epi32(U, X, C) \ + ((__m256i)__builtin_ia32_pshufd256_mask( \ + (__v8si)(__m256i)(X), (int)(C), (__v8si)(__m256i)_mm256_setzero_si256(), \ + (__mmask8)(U))) + +#define _mm_mask_shuffle_epi32(W, U, X, C) \ + ((__m128i)__builtin_ia32_pshufd128_mask( \ + (__v4si)(__m128i)(X), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_shuffle_epi32(U, X, C) \ + ((__m128i)__builtin_ia32_pshufd128_mask( \ + (__v4si)(__m128i)(X), (int)(C), (__v4si)(__m128i)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_rol_epi64(A, B) \ + ((__m256i)__builtin_ia32_prolq256_mask( \ + (__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)_mm256_setzero_si256(), \ + (__mmask8)-1)) + +#define _mm256_mask_rol_epi64(W, U, A, B) \ + ((__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(A), (int)(B), \ + (__v4di)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_rol_epi64(U, A, B) \ + ((__m256i)__builtin_ia32_prolq256_mask( \ + (__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)_mm256_setzero_si256(), \ + (__mmask8)(U))) + +#define _mm_rol_epi64(A, B) \ + ((__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(A), (int)(B), \ + (__v2di)(__m128i)_mm_setzero_si128(), \ + (__mmask8)-1)) + +#define _mm_mask_rol_epi64(W, U, A, B) \ + ((__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(A), (int)(B), \ + (__v2di)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_rol_epi64(U, A, B) \ + ((__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(A), (int)(B), \ + (__v2di)(__m128i)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_ror_epi64(A, B) \ + ((__m256i)__builtin_ia32_prorq256_mask( \ + (__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)_mm256_setzero_si256(), \ + (__mmask8)-1)) + +#define _mm256_mask_ror_epi64(W, U, A, B) \ + 
((__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \ + (__v4di)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_ror_epi64(U, A, B) \ + ((__m256i)__builtin_ia32_prorq256_mask( \ + (__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)_mm256_setzero_si256(), \ + (__mmask8)(U))) + +#define _mm_ror_epi64(A, B) \ + ((__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \ + (__v2di)(__m128i)_mm_setzero_si128(), \ + (__mmask8)-1)) + +#define _mm_mask_ror_epi64(W, U, A, B) \ + ((__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \ + (__v2di)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_ror_epi64(U, A, B) \ + ((__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \ + (__v2di)(__m128i)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_rol_epi32(A, B) \ + ((__m256i)__builtin_ia32_prold256_mask( \ + (__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)_mm256_setzero_si256(), \ + (__mmask8)-1)) + +#define _mm256_mask_rol_epi32(W, U, A, B) \ + ((__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(A), (int)(B), \ + (__v8si)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_rol_epi32(U, A, B) \ + ((__m256i)__builtin_ia32_prold256_mask( \ + (__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)_mm256_setzero_si256(), \ + (__mmask8)(U))) + +#define _mm_rol_epi32(A, B) \ + ((__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(A), (int)(B), \ + (__v4si)(__m128i)_mm_setzero_si128(), \ + (__mmask8)-1)) + +#define _mm_mask_rol_epi32(W, U, A, B) \ + ((__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(A), (int)(B), \ + (__v4si)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_rol_epi32(U, A, B) \ + ((__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(A), (int)(B), \ + (__v4si)(__m128i)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_ror_epi32(A, B) \ + ((__m256i)__builtin_ia32_prord256_mask( \ + (__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)_mm256_setzero_si256(), \ + (__mmask8)-1)) + +#define _mm256_mask_ror_epi32(W, U, A, B) \ + ((__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \ + (__v8si)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_ror_epi32(U, A, B) \ + ((__m256i)__builtin_ia32_prord256_mask( \ + (__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)_mm256_setzero_si256(), \ + (__mmask8)(U))) + +#define _mm_ror_epi32(A, B) \ + ((__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \ + (__v4si)(__m128i)_mm_setzero_si128(), \ + (__mmask8)-1)) + +#define _mm_mask_ror_epi32(W, U, A, B) \ + ((__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \ + (__v4si)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_ror_epi32(U, A, B) \ + ((__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \ + (__v4si)(__m128i)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_alignr_epi32(X, Y, C) \ + ((__m256i)__builtin_ia32_alignd256_mask((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), (int)(C), \ + (__v8si)(__m256i)(X), (__mmask8)-1)) + +#define _mm256_mask_alignr_epi32(W, U, X, Y, C) \ + ((__m256i)__builtin_ia32_alignd256_mask( \ + (__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), \ + (__v8si)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_alignr_epi32(U, X, Y, C) \ + ((__m256i)__builtin_ia32_alignd256_mask( \ + (__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(C), \ + (__v8si)(__m256i)_mm256_setzero_si256(), (__mmask8)(U))) + +#define _mm256_alignr_epi64(X, Y, C) \ + ((__m256i)__builtin_ia32_alignq256_mask((__v4di)(__m256i)(X), \ + 
(__v4di)(__m256i)(Y), (int)(C), \
+                                          (__v4di)(__m256i)(X), (__mmask8)-1))
+
+#define _mm256_mask_alignr_epi64(W, U, X, Y, C) \
+  ((__m256i)__builtin_ia32_alignq256_mask( \
+      (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), \
+      (__v4di)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_alignr_epi64(U, X, Y, C) \
+  ((__m256i)__builtin_ia32_alignq256_mask( \
+      (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(C), \
+      (__v4di)(__m256i)_mm256_setzero_si256(), (__mmask8)(U)))
+
+#define _mm_alignr_epi32(X, Y, C) \
+  ((__m128i)__builtin_ia32_alignd128_mask((__v4si)(__m128i)(X), \
+                                          (__v4si)(__m128i)(Y), (int)(C), \
+                                          (__v4si)(__m128i)(X), (__mmask8)-1))
+
+#define _mm_mask_alignr_epi32(W, U, X, Y, C) \
+  ((__m128i)__builtin_ia32_alignd128_mask( \
+      (__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(C), \
+      (__v4si)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_alignr_epi32(U, X, Y, C) \
+  ((__m128i)__builtin_ia32_alignd128_mask( \
+      (__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(C), \
+      (__v4si)(__m128i)_mm_setzero_si128(), (__mmask8)(U)))
+
+#define _mm_alignr_epi64(X, Y, C) \
+  ((__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(X), \
+                                          (__v2di)(__m128i)(Y), (int)(C), \
+                                          (__v2di)(__m128i)(X), (__mmask8)-1))
+
+#define _mm_mask_alignr_epi64(W, U, X, Y, C) \
+  ((__m128i)__builtin_ia32_alignq128_mask((__v2di)(__m128i)(X), \
+                                          (__v2di)(__m128i)(Y), (int)(C), \
+                                          (__v2di)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_alignr_epi64(U, X, Y, C) \
+  ((__m128i)__builtin_ia32_alignq128_mask( \
+      (__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(C), \
+      (__v2di)(__m128i)_mm_setzero_si128(), (__mmask8)(U)))
+
+#define _mm_mask_cvtps_ph(W, U, A, I) \
+  ((__m128i)__builtin_ia32_vcvtps2ph_mask( \
+      (__v4sf)(__m128)(A), (int)(I), (__v8hi)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_cvtps_ph(U, A, I) \
+  ((__m128i)__builtin_ia32_vcvtps2ph_mask( \
+      (__v4sf)(__m128)(A), (int)(I), (__v8hi)(__m128i)_mm_setzero_si128(), \
+      (__mmask8)(U)))
+
+#define _mm256_mask_cvtps_ph(W, U, A, I) \
+  ((__m128i)__builtin_ia32_vcvtps2ph256_mask( \
+      (__v8sf)(__m256)(A), (int)(I), (__v8hi)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_cvtps_ph(U, A, I) \
+  ((__m128i)__builtin_ia32_vcvtps2ph256_mask( \
+      (__v8sf)(__m256)(A), (int)(I), (__v8hi)(__m128i)_mm_setzero_si128(), \
+      (__mmask8)(U)))
+
+#define _mm256_mask_srai_epi32(W, U, A, B) \
+  ((__m256i)__builtin_ia32_psradi256_mask( \
+      (__v8si)(__m256i)(A), (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_srai_epi32(U, A, B) \
+  ((__m256i)__builtin_ia32_psradi256_mask((__v8si)(__m256i)(A), (int)(B), \
+                                          (__v8si)_mm256_setzero_si256(), \
+                                          (__mmask8)(U)))
+
+#define _mm_mask_srai_epi32(W, U, A, B) \
+  ((__m128i)__builtin_ia32_psradi128_mask( \
+      (__v4si)(__m128i)(A), (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_srai_epi32(U, A, B) \
+  ((__m128i)__builtin_ia32_psradi128_mask((__v4si)(__m128i)(A), (int)(B), \
+                                          (__v4si)_mm_setzero_si128(), \
+                                          (__mmask8)(U)))
+
+#define _mm256_srai_epi64(A, B) \
+  ((__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(B), \
+                                          (__v4di)_mm256_setzero_si256(), \
+                                          (__mmask8)-1))
+
+#define _mm256_mask_srai_epi64(W, U, A, B) \
+  ((__m256i)__builtin_ia32_psraqi256_mask( \
+      (__v4di)(__m256i)(A), (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_srai_epi64(U, A, B) \
+  ((__m256i)__builtin_ia32_psraqi256_mask((__v4di)(__m256i)(A), (int)(B), \
+                                          (__v4di)_mm256_setzero_si256(), \
+                                          (__mmask8)(U)))
+
+#define _mm_srai_epi64(A, B) \
+
((__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(B), \ + (__v2di)_mm_setzero_si128(), \ + (__mmask8)-1)) + +#define _mm_mask_srai_epi64(W, U, A, B) \ + ((__m128i)__builtin_ia32_psraqi128_mask( \ + (__v2di)(__m128i)(A), (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_srai_epi64(U, A, B) \ + ((__m128i)__builtin_ia32_psraqi128_mask((__v2di)(__m128i)(A), (int)(B), \ + (__v2di)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_mask_permutex_pd(W, U, A, B) \ + ((__m256d)__builtin_ia32_permdf256_mask( \ + (__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U))) + +#define _mm256_maskz_permutex_pd(U, A, B) \ + ((__m256d)__builtin_ia32_permdf256_mask( \ + (__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)_mm256_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm256_mask_permute_pd(W, U, X, C) \ + ((__m256d)__builtin_ia32_vpermilpd256_mask( \ + (__v4df)(__m256d)(X), (int)(C), (__v4df)(__m256d)(W), (__mmask8)(U))) + +#define _mm256_maskz_permute_pd(U, X, C) \ + ((__m256d)__builtin_ia32_vpermilpd256_mask( \ + (__v4df)(__m256d)(X), (int)(C), (__v4df)(__m256d)_mm256_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm256_mask_permute_ps(W, U, X, C) \ + ((__m256)__builtin_ia32_vpermilps256_mask( \ + (__v8sf)(__m256)(X), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U))) + +#define _mm256_maskz_permute_ps(U, X, C) \ + ((__m256)__builtin_ia32_vpermilps256_mask( \ + (__v8sf)(__m256)(X), (int)(C), (__v8sf)(__m256)_mm256_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm_mask_permute_pd(W, U, X, C) \ + ((__m128d)__builtin_ia32_vpermilpd_mask( \ + (__v2df)(__m128d)(X), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U))) + +#define _mm_maskz_permute_pd(U, X, C) \ + ((__m128d)__builtin_ia32_vpermilpd_mask((__v2df)(__m128d)(X), (int)(C), \ + (__v2df)(__m128d)_mm_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm_mask_permute_ps(W, U, X, C) \ + ((__m128)__builtin_ia32_vpermilps_mask((__v4sf)(__m128)(X), (int)(C), \ + (__v4sf)(__m128)(W), (__mmask8)(U))) + +#define _mm_maskz_permute_ps(U, X, C) \ + ((__m128)__builtin_ia32_vpermilps_mask((__v4sf)(__m128)(X), (int)(C), \ + (__v4sf)(__m128)_mm_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm256_mask_blend_pd(__U, __A, __W) \ + ((__m256d)__builtin_ia32_blendmpd_256_mask((__v4df)(__A), (__v4df)(__W), \ + (__mmask8)(__U))) + +#define _mm256_mask_blend_ps(__U, __A, __W) \ + ((__m256)__builtin_ia32_blendmps_256_mask((__v8sf)(__A), (__v8sf)(__W), \ + (__mmask8)(__U))) + +#define _mm256_mask_blend_epi64(__U, __A, __W) \ + ((__m256i)__builtin_ia32_blendmq_256_mask((__v4di)(__A), (__v4di)(__W), \ + (__mmask8)(__U))) + +#define _mm256_mask_blend_epi32(__U, __A, __W) \ + ((__m256i)__builtin_ia32_blendmd_256_mask((__v8si)(__A), (__v8si)(__W), \ + (__mmask8)(__U))) + +#define _mm_mask_blend_pd(__U, __A, __W) \ + ((__m128d)__builtin_ia32_blendmpd_128_mask((__v2df)(__A), (__v2df)(__W), \ + (__mmask8)(__U))) + +#define _mm_mask_blend_ps(__U, __A, __W) \ + ((__m128)__builtin_ia32_blendmps_128_mask((__v4sf)(__A), (__v4sf)(__W), \ + (__mmask8)(__U))) + +#define _mm_mask_blend_epi64(__U, __A, __W) \ + ((__m128i)__builtin_ia32_blendmq_128_mask((__v2di)(__A), (__v2di)(__W), \ + (__mmask8)(__U))) + +#define _mm_mask_blend_epi32(__U, __A, __W) \ + ((__m128i)__builtin_ia32_blendmd_128_mask((__v4si)(__A), (__v4si)(__W), \ + (__mmask8)(__U))) + +#define _mm256_cmp_epu32_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_ucmpd256_mask( \ + (__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(P), (__mmask8)-1)) + +#define _mm256_cmp_epi64_mask(X, Y, P) \ + 
((__mmask8)__builtin_ia32_cmpq256_mask( \ + (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)-1)) + +#define _mm256_cmp_epi32_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpd256_mask( \ + (__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(P), (__mmask8)-1)) + +#define _mm256_cmp_epu64_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_ucmpq256_mask( \ + (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)-1)) + +#define _mm256_cmp_pd_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_cmppd256_mask( \ + (__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(P), (__mmask8)-1)) + +#define _mm256_cmp_ps_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpps256_mask( \ + (__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(P), (__mmask8)-1)) + +#define _mm256_mask_cmp_epi64_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpq256_mask( \ + (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)(M))) + +#define _mm256_mask_cmp_epi32_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpd256_mask( \ + (__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(P), (__mmask8)(M))) + +#define _mm256_mask_cmp_epu64_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_ucmpq256_mask( \ + (__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(P), (__mmask8)(M))) + +#define _mm256_mask_cmp_epu32_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_ucmpd256_mask( \ + (__v8si)(__m256i)(X), (__v8si)(__m256i)(Y), (int)(P), (__mmask8)(M))) + +#define _mm256_mask_cmp_pd_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmppd256_mask( \ + (__v4df)(__m256d)(X), (__v4df)(__m256d)(Y), (int)(P), (__mmask8)(M))) + +#define _mm256_mask_cmp_ps_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpps256_mask( \ + (__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), (int)(P), (__mmask8)(M))) + +#define _mm_cmp_epi64_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpq128_mask( \ + (__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)-1)) + +#define _mm_cmp_epi32_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpd128_mask( \ + (__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)-1)) + +#define _mm_cmp_epu64_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_ucmpq128_mask( \ + (__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)-1)) + +#define _mm_cmp_epu32_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_ucmpd128_mask( \ + (__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)-1)) + +#define _mm_cmp_pd_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_cmppd128_mask( \ + (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)-1)) + +#define _mm_cmp_ps_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpps128_mask( \ + (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)-1)) + +#define _mm_mask_cmp_epi64_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpq128_mask( \ + (__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)(M))) + +#define _mm_mask_cmp_epi32_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpd128_mask( \ + (__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)(M))) + +#define _mm_mask_cmp_epu64_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_ucmpq128_mask( \ + (__v2di)(__m128i)(X), (__v2di)(__m128i)(Y), (int)(P), (__mmask8)(M))) + +#define _mm_mask_cmp_epu32_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_ucmpd128_mask( \ + (__v4si)(__m128i)(X), (__v4si)(__m128i)(Y), (int)(P), (__mmask8)(M))) + +#define _mm_mask_cmp_pd_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmppd128_mask( \ + (__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), (int)(P), (__mmask8)(M))) + +#define _mm_mask_cmp_ps_mask(M, X, Y, P) \ + 
((__mmask8)__builtin_ia32_cmpps128_mask( \ + (__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (int)(P), (__mmask8)(M))) + +#endif + +#define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps((B), (A)) + +#ifdef __DISABLE_AVX512VL__ +#undef __DISABLE_AVX512VL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VL__ */ + +#endif /* _AVX512VLINTRIN_H_INCLUDED */ diff --git a/third_party/intel/avx512vnniintrin.internal.h b/third_party/intel/avx512vnniintrin.internal.h new file mode 100644 index 000000000..635b03afe --- /dev/null +++ b/third_party/intel/avx512vnniintrin.internal.h @@ -0,0 +1,109 @@ +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512VNNIINTRIN_H_INCLUDED +#define __AVX512VNNIINTRIN_H_INCLUDED + +#if !defined(__AVX512VNNI__) +#pragma GCC push_options +#pragma GCC target("avx512vnni") +#define __DISABLE_AVX512VNNI__ +#endif /* __AVX512VNNI__ */ + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_dpbusd_epi32(__m512i __A, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_vpdpbusd_v16si((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_dpbusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { + return (__m512i)__builtin_ia32_vpdpbusd_v16si_mask( + (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_dpbusd_epi32(__mmask16 __A, __m512i __B, __m512i __C, + __m512i __D) { + return (__m512i)__builtin_ia32_vpdpbusd_v16si_maskz( + (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_dpbusds_epi32(__m512i __A, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_vpdpbusds_v16si((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_dpbusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, + __m512i __D) { + return (__m512i)__builtin_ia32_vpdpbusds_v16si_mask( + (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_dpbusds_epi32(__mmask16 __A, __m512i __B, __m512i __C, + __m512i __D) { + return (__m512i)__builtin_ia32_vpdpbusds_v16si_maskz( + (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_dpwssd_epi32(__m512i __A, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_vpdpwssd_v16si((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm512_mask_dpwssd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { + return (__m512i)__builtin_ia32_vpdpwssd_v16si_mask( + (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_dpwssd_epi32(__mmask16 __A, __m512i __B, __m512i __C, + __m512i __D) { + return (__m512i)__builtin_ia32_vpdpwssd_v16si_maskz( + (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm512_dpwssds_epi32(__m512i __A, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_vpdpwssds_v16si((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_dpwssds_epi32(__m512i __A, __mmask16 __B, __m512i __C, + __m512i __D) { + return (__m512i)__builtin_ia32_vpdpwssds_v16si_mask( + (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_dpwssds_epi32(__mmask16 __A, __m512i __B, __m512i __C, + __m512i __D) { + return (__m512i)__builtin_ia32_vpdpwssds_v16si_maskz( + (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); +} + +#ifdef __DISABLE_AVX512VNNI__ +#undef __DISABLE_AVX512VNNI__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VNNI__ */ + +#endif /* __AVX512VNNIINTRIN_H_INCLUDED */ diff --git a/third_party/intel/avx512vnnivlintrin.internal.h b/third_party/intel/avx512vnnivlintrin.internal.h new file mode 100644 index 000000000..48fb03c5e --- /dev/null +++ b/third_party/intel/avx512vnnivlintrin.internal.h @@ -0,0 +1,188 @@ +#ifndef _IMMINTRIN_H_INCLUDED +#error \ + "Never use directly; include instead." +#endif + +#ifndef _AVX512VNNIVLINTRIN_H_INCLUDED +#define _AVX512VNNIVLINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512VNNI__) +#pragma GCC push_options +#pragma GCC target("avx512vnni,avx512vl") +#define __DISABLE_AVX512VNNIVL__ +#endif /* __AVX512VNNIVL__ */ + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_dpbusd_epi32(__m256i __A, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_vpdpbusd_v8si((__v8si)__A, (__v8si)__B, + (__v8si)__C); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_dpbusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { + return (__m256i)__builtin_ia32_vpdpbusd_v8si_mask((__v8si)__A, (__v8si)__C, + (__v8si)__D, (__mmask8)__B); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_maskz_dpbusd_epi32(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) { + return (__m256i)__builtin_ia32_vpdpbusd_v8si_maskz( + (__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_dpbusd_epi32(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpdpbusd_v4si((__v4si)__A, (__v4si)__B, + (__v4si)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_dpbusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpdpbusd_v4si_mask((__v4si)__A, (__v4si)__C, + (__v4si)__D, (__mmask8)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_dpbusd_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpdpbusd_v4si_maskz( + (__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_dpbusds_epi32(__m256i __A, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_vpdpbusds_v8si((__v8si)__A, (__v8si)__B, + (__v8si)__C); +} + +extern __inline __m256i __attribute__((__gnu_inline__, 
__always_inline__, + __artificial__)) +_mm256_mask_dpbusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { + return (__m256i)__builtin_ia32_vpdpbusds_v8si_mask( + (__v8si)__A, (__v8si)__C, (__v8si)__D, (__mmask8)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_dpbusds_epi32(__mmask8 __A, __m256i __B, __m256i __C, + __m256i __D) { + return (__m256i)__builtin_ia32_vpdpbusds_v8si_maskz( + (__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_dpbusds_epi32(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpdpbusds_v4si((__v4si)__A, (__v4si)__B, + (__v4si)__C); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_dpbusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpdpbusds_v4si_mask( + (__v4si)__A, (__v4si)__C, (__v4si)__D, (__mmask8)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_dpbusds_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpdpbusds_v4si_maskz( + (__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_dpwssd_epi32(__m256i __A, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_vpdpwssd_v8si((__v8si)__A, (__v8si)__B, + (__v8si)__C); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_dpwssd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { + return (__m256i)__builtin_ia32_vpdpwssd_v8si_mask((__v8si)__A, (__v8si)__C, + (__v8si)__D, (__mmask8)__B); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_maskz_dpwssd_epi32(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) { + return (__m256i)__builtin_ia32_vpdpwssd_v8si_maskz( + (__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_dpwssd_epi32(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpdpwssd_v4si((__v4si)__A, (__v4si)__B, + (__v4si)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_dpwssd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpdpwssd_v4si_mask((__v4si)__A, (__v4si)__C, + (__v4si)__D, (__mmask8)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_dpwssd_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpdpwssd_v4si_maskz( + (__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_dpwssds_epi32(__m256i __A, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_vpdpwssds_v8si((__v8si)__A, (__v8si)__B, + (__v8si)__C); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_mask_dpwssds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { + return (__m256i)__builtin_ia32_vpdpwssds_v8si_mask( + (__v8si)__A, (__v8si)__C, (__v8si)__D, 
(__mmask8)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_dpwssds_epi32(__mmask8 __A, __m256i __B, __m256i __C, + __m256i __D) { + return (__m256i)__builtin_ia32_vpdpwssds_v8si_maskz( + (__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_dpwssds_epi32(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpdpwssds_v4si((__v4si)__A, (__v4si)__B, + (__v4si)__C); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_dpwssds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpdpwssds_v4si_mask( + (__v4si)__A, (__v4si)__C, (__v4si)__D, (__mmask8)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_maskz_dpwssds_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vpdpwssds_v4si_maskz( + (__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A); +} +#ifdef __DISABLE_AVX512VNNIVL__ +#undef __DISABLE_AVX512VNNIVL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VNNIVL__ */ +#endif /* __DISABLE_AVX512VNNIVL__ */ diff --git a/third_party/intel/avx512vpopcntdqintrin.internal.h b/third_party/intel/avx512vpopcntdqintrin.internal.h new file mode 100644 index 000000000..cc7bc6e12 --- /dev/null +++ b/third_party/intel/avx512vpopcntdqintrin.internal.h @@ -0,0 +1,60 @@ +#if !defined _IMMINTRIN_H_INCLUDED +#error \ + "Never use directly; include instead." +#endif + +#ifndef _AVX512VPOPCNTDQINTRIN_H_INCLUDED +#define _AVX512VPOPCNTDQINTRIN_H_INCLUDED + +#ifndef __AVX512VPOPCNTDQ__ +#pragma GCC push_options +#pragma GCC target("avx512vpopcntdq") +#define __DISABLE_AVX512VPOPCNTDQ__ +#endif /* __AVX512VPOPCNTDQ__ */ + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_popcnt_epi32(__m512i __A) { + return (__m512i)__builtin_ia32_vpopcountd_v16si((__v16si)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_popcnt_epi32(__m512i __A, __mmask16 __U, __m512i __B) { + return (__m512i)__builtin_ia32_vpopcountd_v16si_mask( + (__v16si)__A, (__v16si)__B, (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) { + return (__m512i)__builtin_ia32_vpopcountd_v16si_mask( + (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_popcnt_epi64(__m512i __A) { + return (__m512i)__builtin_ia32_vpopcountq_v8di((__v8di)__A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_popcnt_epi64(__m512i __A, __mmask8 __U, __m512i __B) { + return (__m512i)__builtin_ia32_vpopcountq_v8di_mask((__v8di)__A, (__v8di)__B, + (__mmask8)__U); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) { + return (__m512i)__builtin_ia32_vpopcountq_v8di_mask( + (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); +} + +#ifdef __DISABLE_AVX512VPOPCNTDQ__ +#undef __DISABLE_AVX512VPOPCNTDQ__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VPOPCNTDQ__ */ + 
+#endif /* _AVX512VPOPCNTDQINTRIN_H_INCLUDED */ diff --git a/third_party/intel/avx512vpopcntdqvlintrin.internal.h b/third_party/intel/avx512vpopcntdqvlintrin.internal.h new file mode 100644 index 000000000..4dc14e009 --- /dev/null +++ b/third_party/intel/avx512vpopcntdqvlintrin.internal.h @@ -0,0 +1,100 @@ +#if !defined _IMMINTRIN_H_INCLUDED +#error \ + "Never use directly; include instead." +#endif + +#ifndef _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED +#define _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED + +#if !defined(__AVX512VPOPCNTDQ__) || !defined(__AVX512VL__) +#pragma GCC push_options +#pragma GCC target("avx512vpopcntdq,avx512vl") +#define __DISABLE_AVX512VPOPCNTDQVL__ +#endif /* __AVX512VPOPCNTDQVL__ */ + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_popcnt_epi32(__m128i __A) { + return (__m128i)__builtin_ia32_vpopcountd_v4si((__v4si)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_popcnt_epi32(__m128i __A, __mmask16 __U, __m128i __B) { + return (__m128i)__builtin_ia32_vpopcountd_v4si_mask((__v4si)__A, (__v4si)__B, + (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_popcnt_epi32(__mmask16 __U, __m128i __A) { + return (__m128i)__builtin_ia32_vpopcountd_v4si_mask( + (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_popcnt_epi32(__m256i __A) { + return (__m256i)__builtin_ia32_vpopcountd_v8si((__v8si)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_popcnt_epi32(__m256i __A, __mmask16 __U, __m256i __B) { + return (__m256i)__builtin_ia32_vpopcountd_v8si_mask((__v8si)__A, (__v8si)__B, + (__mmask16)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_popcnt_epi32(__mmask16 __U, __m256i __A) { + return (__m256i)__builtin_ia32_vpopcountd_v8si_mask( + (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask16)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_popcnt_epi64(__m128i __A) { + return (__m128i)__builtin_ia32_vpopcountq_v2di((__v2di)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_popcnt_epi64(__m128i __A, __mmask8 __U, __m128i __B) { + return (__m128i)__builtin_ia32_vpopcountq_v2di_mask((__v2di)__A, (__v2di)__B, + (__mmask8)__U); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_vpopcountq_v2di_mask( + (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_popcnt_epi64(__m256i __A) { + return (__m256i)__builtin_ia32_vpopcountq_v4di((__v4di)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_popcnt_epi64(__m256i __A, __mmask8 __U, __m256i __B) { + return (__m256i)__builtin_ia32_vpopcountq_v4di_mask((__v4di)__A, (__v4di)__B, + (__mmask8)__U); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) { + return 
(__m256i)__builtin_ia32_vpopcountq_v4di_mask( + (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +#ifdef __DISABLE_AVX512VPOPCNTDQVL__ +#undef __DISABLE_AVX512VPOPCNTDQVL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VPOPCNTDQVL__ */ + +#endif /* _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED */ diff --git a/third_party/intel/avxintrin.internal.h b/third_party/intel/avxintrin.internal.h new file mode 100644 index 000000000..93542c8d4 --- /dev/null +++ b/third_party/intel/avxintrin.internal.h @@ -0,0 +1,1374 @@ +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avxintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVXINTRIN_H_INCLUDED +#define _AVXINTRIN_H_INCLUDED + +#ifndef __AVX__ +#pragma GCC push_options +#pragma GCC target("avx") +#define __DISABLE_AVX__ +#endif /* __AVX__ */ + +typedef double __v4df __attribute__((__vector_size__(32))); +typedef float __v8sf __attribute__((__vector_size__(32))); +typedef long long __v4di __attribute__((__vector_size__(32))); +typedef unsigned long long __v4du __attribute__((__vector_size__(32))); +typedef int __v8si __attribute__((__vector_size__(32))); +typedef unsigned int __v8su __attribute__((__vector_size__(32))); +typedef short __v16hi __attribute__((__vector_size__(32))); +typedef unsigned short __v16hu __attribute__((__vector_size__(32))); +typedef char __v32qi __attribute__((__vector_size__(32))); +typedef unsigned char __v32qu __attribute__((__vector_size__(32))); + +typedef float __m256 __attribute__((__vector_size__(32), __may_alias__)); +typedef long long __m256i __attribute__((__vector_size__(32), __may_alias__)); +typedef double __m256d __attribute__((__vector_size__(32), __may_alias__)); + +typedef float __m256_u + __attribute__((__vector_size__(32), __may_alias__, __aligned__(1))); +typedef long long __m256i_u + __attribute__((__vector_size__(32), __may_alias__, __aligned__(1))); +typedef double __m256d_u + __attribute__((__vector_size__(32), __may_alias__, __aligned__(1))); + +#define _CMP_EQ_OQ 0x00 +#define _CMP_LT_OS 0x01 +#define _CMP_LE_OS 0x02 +#define _CMP_UNORD_Q 0x03 +#define _CMP_NEQ_UQ 0x04 +#define _CMP_NLT_US 0x05 +#define _CMP_NLE_US 0x06 +#define _CMP_ORD_Q 0x07 +#define _CMP_EQ_UQ 0x08 +#define _CMP_NGE_US 0x09 +#define _CMP_NGT_US 0x0a +#define _CMP_FALSE_OQ 0x0b +#define _CMP_NEQ_OQ 0x0c +#define _CMP_GE_OS 0x0d +#define _CMP_GT_OS 0x0e +#define _CMP_TRUE_UQ 0x0f +#define _CMP_EQ_OS 0x10 +#define _CMP_LT_OQ 0x11 +#define _CMP_LE_OQ 0x12 +#define _CMP_UNORD_S 0x13 +#define _CMP_NEQ_US 0x14 +#define _CMP_NLT_UQ 0x15 +#define _CMP_NLE_UQ 0x16 +#define _CMP_ORD_S 0x17 +#define _CMP_EQ_US 0x18 +#define _CMP_NGE_UQ 0x19 +#define _CMP_NGT_UQ 0x1a +#define _CMP_FALSE_OS 0x1b +#define _CMP_NEQ_OS 0x1c +#define _CMP_GE_OQ 0x1d +#define _CMP_GT_OQ 0x1e +#define _CMP_TRUE_US 0x1f + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_add_pd(__m256d __A, __m256d __B) { + return (__m256d)((__v4df)__A + (__v4df)__B); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_add_ps(__m256 __A, __m256 __B) { + return (__m256)((__v8sf)__A + (__v8sf)__B); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_addsub_pd(__m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_addsubpd256((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_addsub_ps(__m256 __A, __m256 __B) { + return 
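+ /* vaddsubps: even-indexed float lanes compute __A - __B, odd-indexed lanes __A + __B. */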
(__m256)__builtin_ia32_addsubps256((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_and_pd(__m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_andpd256((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_and_ps(__m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_andps256((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_andnot_pd(__m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_andnpd256((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_andnot_ps(__m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_andnps256((__v8sf)__A, (__v8sf)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_blend_pd(__m256d __X, __m256d __Y, const int __M) { + return (__m256d)__builtin_ia32_blendpd256((__v4df)__X, (__v4df)__Y, __M); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_blend_ps(__m256 __X, __m256 __Y, const int __M) { + return (__m256)__builtin_ia32_blendps256((__v8sf)__X, (__v8sf)__Y, __M); +} +#else +#define _mm256_blend_pd(X, Y, M) \ + ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), (int)(M))) + +#define _mm256_blend_ps(X, Y, M) \ + ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), \ + (int)(M))) +#endif + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_blendv_pd(__m256d __X, __m256d __Y, __m256d __M) { + return (__m256d)__builtin_ia32_blendvpd256((__v4df)__X, (__v4df)__Y, + (__v4df)__M); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_blendv_ps(__m256 __X, __m256 __Y, __m256 __M) { + return (__m256)__builtin_ia32_blendvps256((__v8sf)__X, (__v8sf)__Y, + (__v8sf)__M); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_div_pd(__m256d __A, __m256d __B) { + return (__m256d)((__v4df)__A / (__v4df)__B); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_div_ps(__m256 __A, __m256 __B) { + return (__m256)((__v8sf)__A / (__v8sf)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_dp_ps(__m256 __X, __m256 __Y, const int __M) { + return (__m256)__builtin_ia32_dpps256((__v8sf)__X, (__v8sf)__Y, __M); +} +#else +#define _mm256_dp_ps(X, Y, M) \ + ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), \ + (int)(M))) +#endif + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_hadd_pd(__m256d __X, __m256d __Y) { + return (__m256d)__builtin_ia32_haddpd256((__v4df)__X, (__v4df)__Y); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_hadd_ps(__m256 __X, __m256 __Y) { + return (__m256)__builtin_ia32_haddps256((__v8sf)__X, (__v8sf)__Y); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_hsub_pd(__m256d __X, __m256d __Y) { + return (__m256d)__builtin_ia32_hsubpd256((__v4df)__X, 
(__v4df)__Y); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_hsub_ps(__m256 __X, __m256 __Y) { + return (__m256)__builtin_ia32_hsubps256((__v8sf)__X, (__v8sf)__Y); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_max_pd(__m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_maxpd256((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_max_ps(__m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_maxps256((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_min_pd(__m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_minpd256((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_min_ps(__m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_minps256((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mul_pd(__m256d __A, __m256d __B) { + return (__m256d)((__v4df)__A * (__v4df)__B); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mul_ps(__m256 __A, __m256 __B) { + return (__m256)((__v8sf)__A * (__v8sf)__B); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_or_pd(__m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_orpd256((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_or_ps(__m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_orps256((__v8sf)__A, (__v8sf)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shuffle_pd(__m256d __A, __m256d __B, const int __mask) { + return (__m256d)__builtin_ia32_shufpd256((__v4df)__A, (__v4df)__B, __mask); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_shuffle_ps(__m256 __A, __m256 __B, const int __mask) { + return (__m256)__builtin_ia32_shufps256((__v8sf)__A, (__v8sf)__B, __mask); +} +#else +#define _mm256_shuffle_pd(A, B, N) \ + ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(N))) + +#define _mm256_shuffle_ps(A, B, N) \ + ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), \ + (int)(N))) +#endif + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sub_pd(__m256d __A, __m256d __B) { + return (__m256d)((__v4df)__A - (__v4df)__B); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sub_ps(__m256 __A, __m256 __B) { + return (__m256)((__v8sf)__A - (__v8sf)__B); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_xor_pd(__m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_xorpd256((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_xor_ps(__m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_xorps256((__v8sf)__A, (__v8sf)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm_cmp_pd(__m128d __X, __m128d __Y, const int __P) { + return (__m128d)__builtin_ia32_cmppd((__v2df)__X, (__v2df)__Y, __P); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmp_ps(__m128 __X, __m128 __Y, const int __P) { + return (__m128)__builtin_ia32_cmpps((__v4sf)__X, (__v4sf)__Y, __P); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmp_pd(__m256d __X, __m256d __Y, const int __P) { + return (__m256d)__builtin_ia32_cmppd256((__v4df)__X, (__v4df)__Y, __P); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cmp_ps(__m256 __X, __m256 __Y, const int __P) { + return (__m256)__builtin_ia32_cmpps256((__v8sf)__X, (__v8sf)__Y, __P); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmp_sd(__m128d __X, __m128d __Y, const int __P) { + return (__m128d)__builtin_ia32_cmpsd((__v2df)__X, (__v2df)__Y, __P); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmp_ss(__m128 __X, __m128 __Y, const int __P) { + return (__m128)__builtin_ia32_cmpss((__v4sf)__X, (__v4sf)__Y, __P); +} +#else +#define _mm_cmp_pd(X, Y, P) \ + ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \ + (int)(P))) + +#define _mm_cmp_ps(X, Y, P) \ + ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ + (int)(P))) + +#define _mm256_cmp_pd(X, Y, P) \ + ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), (int)(P))) + +#define _mm256_cmp_ps(X, Y, P) \ + ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(X), (__v8sf)(__m256)(Y), \ + (int)(P))) + +#define _mm_cmp_sd(X, Y, P) \ + ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \ + (int)(P))) + +#define _mm_cmp_ss(X, Y, P) \ + ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ + (int)(P))) +#endif + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi32_pd(__m128i __A) { + return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si)__A); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtepi32_ps(__m256i __A) { + return (__m256)__builtin_ia32_cvtdq2ps256((__v8si)__A); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtpd_ps(__m256d __A) { + return (__m128)__builtin_ia32_cvtpd2ps256((__v4df)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtps_epi32(__m256 __A) { + return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf)__A); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtps_pd(__m128 __A) { + return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvttpd_epi32(__m256d __A) { + return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtpd_epi32(__m256d __A) { + return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvttps_epi32(__m256 __A) { + return 
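+ /* vcvttps2dq converts with truncation toward zero, whereas _mm256_cvtps_epi32 above rounds according to the current MXCSR mode. */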
(__m256i)__builtin_ia32_cvttps2dq256((__v8sf)__A); +} + +extern __inline double + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtsd_f64(__m256d __A) { + return __A[0]; +} + +extern __inline float + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_cvtss_f32(__m256 __A) { + return __A[0]; +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_extractf128_pd(__m256d __X, const int __N) { + return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__X, __N); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_extractf128_ps(__m256 __X, const int __N) { + return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__X, __N); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_extractf128_si256(__m256i __X, const int __N) { + return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__X, __N); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_extract_epi32(__m256i __X, int const __N) { + __m128i __Y = _mm256_extractf128_si256(__X, __N >> 2); + return _mm_extract_epi32(__Y, __N % 4); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_extract_epi16(__m256i __X, int const __N) { + __m128i __Y = _mm256_extractf128_si256(__X, __N >> 3); + return _mm_extract_epi16(__Y, __N % 8); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_extract_epi8(__m256i __X, int const __N) { + __m128i __Y = _mm256_extractf128_si256(__X, __N >> 4); + return _mm_extract_epi8(__Y, __N % 16); +} + +#ifdef __x86_64__ +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_extract_epi64(__m256i __X, const int __N) { + __m128i __Y = _mm256_extractf128_si256(__X, __N >> 1); + return _mm_extract_epi64(__Y, __N % 2); +} +#endif +#else +#define _mm256_extractf128_pd(X, N) \ + ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(X), (int)(N))) + +#define _mm256_extractf128_ps(X, N) \ + ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(X), (int)(N))) + +#define _mm256_extractf128_si256(X, N) \ + ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(X), (int)(N))) + +#define _mm256_extract_epi32(X, N) \ + (__extension__({ \ + __m128i __Y = _mm256_extractf128_si256((X), (N) >> 2); \ + _mm_extract_epi32(__Y, (N) % 4); \ + })) + +#define _mm256_extract_epi16(X, N) \ + (__extension__({ \ + __m128i __Y = _mm256_extractf128_si256((X), (N) >> 3); \ + _mm_extract_epi16(__Y, (N) % 8); \ + })) + +#define _mm256_extract_epi8(X, N) \ + (__extension__({ \ + __m128i __Y = _mm256_extractf128_si256((X), (N) >> 4); \ + _mm_extract_epi8(__Y, (N) % 16); \ + })) + +#ifdef __x86_64__ +#define _mm256_extract_epi64(X, N) \ + (__extension__({ \ + __m128i __Y = _mm256_extractf128_si256((X), (N) >> 1); \ + _mm_extract_epi64(__Y, (N) % 2); \ + })) +#endif +#endif + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_zeroall(void) { + __builtin_ia32_vzeroall(); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_zeroupper(void) { + __builtin_ia32_vzeroupper(); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_permutevar_pd(__m128d __A, 
__m128i __C) { + return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__A, (__v2di)__C); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permutevar_pd(__m256d __A, __m256i __C) { + return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__A, (__v4di)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_permutevar_ps(__m128 __A, __m128i __C) { + return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__A, (__v4si)__C); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permutevar_ps(__m256 __A, __m256i __C) { + return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__A, (__v8si)__C); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_permute_pd(__m128d __X, const int __C) { + return (__m128d)__builtin_ia32_vpermilpd((__v2df)__X, __C); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permute_pd(__m256d __X, const int __C) { + return (__m256d)__builtin_ia32_vpermilpd256((__v4df)__X, __C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_permute_ps(__m128 __X, const int __C) { + return (__m128)__builtin_ia32_vpermilps((__v4sf)__X, __C); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permute_ps(__m256 __X, const int __C) { + return (__m256)__builtin_ia32_vpermilps256((__v8sf)__X, __C); +} +#else +#define _mm_permute_pd(X, C) \ + ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(X), (int)(C))) + +#define _mm256_permute_pd(X, C) \ + ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(X), (int)(C))) + +#define _mm_permute_ps(X, C) \ + ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(X), (int)(C))) + +#define _mm256_permute_ps(X, C) \ + ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(X), (int)(C))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permute2f128_pd(__m256d __X, __m256d __Y, const int __C) { + return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__X, (__v4df)__Y, + __C); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permute2f128_ps(__m256 __X, __m256 __Y, const int __C) { + return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__X, (__v8sf)__Y, __C); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permute2f128_si256(__m256i __X, __m256i __Y, const int __C) { + return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__X, (__v8si)__Y, + __C); +} +#else +#define _mm256_permute2f128_pd(X, Y, C) \ + ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), (int)(C))) + +#define _mm256_permute2f128_ps(X, Y, C) \ + ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), (int)(C))) + +#define _mm256_permute2f128_si256(X, Y, C) \ + ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), (int)(C))) +#endif + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_broadcast_ss(float const *__X) { + return (__m128)__builtin_ia32_vbroadcastss(__X); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm256_broadcast_sd(double const *__X) { + return (__m256d)__builtin_ia32_vbroadcastsd256(__X); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcast_ss(float const *__X) { + return (__m256)__builtin_ia32_vbroadcastss256(__X); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcast_pd(__m128d const *__X) { + return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__X); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_broadcast_ps(__m128 const *__X) { + return (__m256)__builtin_ia32_vbroadcastf128_ps256(__X); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_insertf128_pd(__m256d __X, __m128d __Y, const int __O) { + return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__X, (__v2df)__Y, + __O); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_insertf128_ps(__m256 __X, __m128 __Y, const int __O) { + return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__X, (__v4sf)__Y, + __O); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_insertf128_si256(__m256i __X, __m128i __Y, const int __O) { + return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__X, (__v4si)__Y, + __O); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_insert_epi32(__m256i __X, int __D, int const __N) { + __m128i __Y = _mm256_extractf128_si256(__X, __N >> 2); + __Y = _mm_insert_epi32(__Y, __D, __N % 4); + return _mm256_insertf128_si256(__X, __Y, __N >> 2); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_insert_epi16(__m256i __X, int __D, int const __N) { + __m128i __Y = _mm256_extractf128_si256(__X, __N >> 3); + __Y = _mm_insert_epi16(__Y, __D, __N % 8); + return _mm256_insertf128_si256(__X, __Y, __N >> 3); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_insert_epi8(__m256i __X, int __D, int const __N) { + __m128i __Y = _mm256_extractf128_si256(__X, __N >> 4); + __Y = _mm_insert_epi8(__Y, __D, __N % 16); + return _mm256_insertf128_si256(__X, __Y, __N >> 4); +} + +#ifdef __x86_64__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_insert_epi64(__m256i __X, long long __D, int const __N) { + __m128i __Y = _mm256_extractf128_si256(__X, __N >> 1); + __Y = _mm_insert_epi64(__Y, __D, __N % 2); + return _mm256_insertf128_si256(__X, __Y, __N >> 1); +} +#endif +#else +#define _mm256_insertf128_pd(X, Y, O) \ + ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(X), \ + (__v2df)(__m128d)(Y), (int)(O))) + +#define _mm256_insertf128_ps(X, Y, O) \ + ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(X), \ + (__v4sf)(__m128)(Y), (int)(O))) + +#define _mm256_insertf128_si256(X, Y, O) \ + ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(X), \ + (__v4si)(__m128i)(Y), (int)(O))) + +#define _mm256_insert_epi32(X, D, N) \ + (__extension__({ \ + __m128i __Y = _mm256_extractf128_si256((X), (N) >> 2); \ + __Y = _mm_insert_epi32(__Y, (D), (N) % 4); \ + _mm256_insertf128_si256((X), __Y, (N) >> 2); \ + })) + +#define _mm256_insert_epi16(X, D, N) \ + (__extension__({ \ + __m128i __Y = 
_mm256_extractf128_si256((X), (N) >> 3); \ + __Y = _mm_insert_epi16(__Y, (D), (N) % 8); \ + _mm256_insertf128_si256((X), __Y, (N) >> 3); \ + })) + +#define _mm256_insert_epi8(X, D, N) \ + (__extension__({ \ + __m128i __Y = _mm256_extractf128_si256((X), (N) >> 4); \ + __Y = _mm_insert_epi8(__Y, (D), (N) % 16); \ + _mm256_insertf128_si256((X), __Y, (N) >> 4); \ + })) + +#ifdef __x86_64__ +#define _mm256_insert_epi64(X, D, N) \ + (__extension__({ \ + __m128i __Y = _mm256_extractf128_si256((X), (N) >> 1); \ + __Y = _mm_insert_epi64(__Y, (D), (N) % 2); \ + _mm256_insertf128_si256((X), __Y, (N) >> 1); \ + })) +#endif +#endif + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_load_pd(double const *__P) { + return *(__m256d *)__P; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_store_pd(double *__P, __m256d __A) { + *(__m256d *)__P = __A; +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_load_ps(float const *__P) { + return *(__m256 *)__P; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_store_ps(float *__P, __m256 __A) { + *(__m256 *)__P = __A; +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_loadu_pd(double const *__P) { + return *(__m256d_u *)__P; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_storeu_pd(double *__P, __m256d __A) { + *(__m256d_u *)__P = __A; +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_loadu_ps(float const *__P) { + return *(__m256_u *)__P; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_storeu_ps(float *__P, __m256 __A) { + *(__m256_u *)__P = __A; +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_load_si256(__m256i const *__P) { + return *__P; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_store_si256(__m256i *__P, __m256i __A) { + *__P = __A; +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_loadu_si256(__m256i_u const *__P) { + return *__P; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_storeu_si256(__m256i_u *__P, __m256i __A) { + *__P = __A; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskload_pd(double const *__P, __m128i __M) { + return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__P, (__v2di)__M); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskstore_pd(double *__P, __m128i __M, __m128d __A) { + __builtin_ia32_maskstorepd((__v2df *)__P, (__v2di)__M, (__v2df)__A); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskload_pd(double const *__P, __m256i __M) { + return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__P, + (__v4di)__M); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskstore_pd(double *__P, __m256i __M, __m256d __A) { + __builtin_ia32_maskstorepd256((__v4df *)__P, (__v4di)__M, (__v4df)__A); +} + +extern __inline __m128 + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskload_ps(float const *__P, __m128i __M) { + return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__P, (__v4si)__M); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskstore_ps(float *__P, __m128i __M, __m128 __A) { + __builtin_ia32_maskstoreps((__v4sf *)__P, (__v4si)__M, (__v4sf)__A); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskload_ps(float const *__P, __m256i __M) { + return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__P, (__v8si)__M); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskstore_ps(float *__P, __m256i __M, __m256 __A) { + __builtin_ia32_maskstoreps256((__v8sf *)__P, (__v8si)__M, (__v8sf)__A); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_movehdup_ps(__m256 __X) { + return (__m256)__builtin_ia32_movshdup256((__v8sf)__X); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_moveldup_ps(__m256 __X) { + return (__m256)__builtin_ia32_movsldup256((__v8sf)__X); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_movedup_pd(__m256d __X) { + return (__m256d)__builtin_ia32_movddup256((__v4df)__X); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_lddqu_si256(__m256i const *__P) { + return (__m256i)__builtin_ia32_lddqu256((char const *)__P); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_stream_si256(__m256i *__A, __m256i __B) { + __builtin_ia32_movntdq256((__v4di *)__A, (__v4di)__B); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_stream_pd(double *__A, __m256d __B) { + __builtin_ia32_movntpd256(__A, (__v4df)__B); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_stream_ps(float *__P, __m256 __A) { + __builtin_ia32_movntps256(__P, (__v8sf)__A); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_rcp_ps(__m256 __A) { + return (__m256)__builtin_ia32_rcpps256((__v8sf)__A); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_rsqrt_ps(__m256 __A) { + return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__A); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sqrt_pd(__m256d __A) { + return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__A); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_sqrt_ps(__m256 __A) { + return (__m256)__builtin_ia32_sqrtps256((__v8sf)__A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_round_pd(__m256d __V, const int __M) { + return (__m256d)__builtin_ia32_roundpd256((__v4df)__V, __M); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_round_ps(__m256 __V, const int __M) { + return (__m256)__builtin_ia32_roundps256((__v8sf)__V, __M); +} +#else +#define _mm256_round_pd(V, M) \ + ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (int)(M))) + +#define 
_mm256_round_ps(V, M) \ + ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (int)(M))) +#endif + +#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL) +#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) +#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) +#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_unpackhi_pd(__m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_unpckhpd256((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_unpacklo_pd(__m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_unpcklpd256((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_unpackhi_ps(__m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_unpckhps256((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_unpacklo_ps(__m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_unpcklps256((__v8sf)__A, (__v8sf)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_testz_pd(__m128d __M, __m128d __V) { + return __builtin_ia32_vtestzpd((__v2df)__M, (__v2df)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_testc_pd(__m128d __M, __m128d __V) { + return __builtin_ia32_vtestcpd((__v2df)__M, (__v2df)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_testnzc_pd(__m128d __M, __m128d __V) { + return __builtin_ia32_vtestnzcpd((__v2df)__M, (__v2df)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_testz_ps(__m128 __M, __m128 __V) { + return __builtin_ia32_vtestzps((__v4sf)__M, (__v4sf)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_testc_ps(__m128 __M, __m128 __V) { + return __builtin_ia32_vtestcps((__v4sf)__M, (__v4sf)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_testnzc_ps(__m128 __M, __m128 __V) { + return __builtin_ia32_vtestnzcps((__v4sf)__M, (__v4sf)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_testz_pd(__m256d __M, __m256d __V) { + return __builtin_ia32_vtestzpd256((__v4df)__M, (__v4df)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_testc_pd(__m256d __M, __m256d __V) { + return __builtin_ia32_vtestcpd256((__v4df)__M, (__v4df)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_testnzc_pd(__m256d __M, __m256d __V) { + return __builtin_ia32_vtestnzcpd256((__v4df)__M, (__v4df)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_testz_ps(__m256 __M, __m256 __V) { + return __builtin_ia32_vtestzps256((__v8sf)__M, (__v8sf)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_testc_ps(__m256 __M, __m256 __V) { + return __builtin_ia32_vtestcps256((__v8sf)__M, (__v8sf)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm256_testnzc_ps(__m256 __M, __m256 __V) { + return __builtin_ia32_vtestnzcps256((__v8sf)__M, (__v8sf)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_testz_si256(__m256i __M, __m256i __V) { + return __builtin_ia32_ptestz256((__v4di)__M, (__v4di)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_testc_si256(__m256i __M, __m256i __V) { + return __builtin_ia32_ptestc256((__v4di)__M, (__v4di)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_testnzc_si256(__m256i __M, __m256i __V) { + return __builtin_ia32_ptestnzc256((__v4di)__M, (__v4di)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_movemask_pd(__m256d __A) { + return __builtin_ia32_movmskpd256((__v4df)__A); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_movemask_ps(__m256 __A) { + return __builtin_ia32_movmskps256((__v8sf)__A); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_undefined_pd(void) { + __m256d __Y = __Y; + return __Y; +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_undefined_ps(void) { + __m256 __Y = __Y; + return __Y; +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_undefined_si256(void) { + __m256i __Y = __Y; + return __Y; +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_setzero_pd(void) { + return __extension__(__m256d){0.0, 0.0, 0.0, 0.0}; +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_setzero_ps(void) { + return __extension__(__m256){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_setzero_si256(void) { + return __extension__(__m256i)(__v4di){0, 0, 0, 0}; +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_set_pd(double __A, double __B, double __C, double __D) { + return __extension__(__m256d){__D, __C, __B, __A}; +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_set_ps(float __A, float __B, float __C, float __D, float __E, + float __F, float __G, float __H) { + return __extension__(__m256){__H, __G, __F, __E, __D, __C, __B, __A}; +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_set_epi32(int __A, int __B, int __C, int __D, int __E, int __F, + int __G, int __H) { + return __extension__(__m256i)(__v8si){__H, __G, __F, __E, __D, __C, __B, __A}; +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_set_epi16(short __q15, short __q14, short __q13, short __q12, + short __q11, short __q10, short __q09, short __q08, + short __q07, short __q06, short __q05, short __q04, + short __q03, short __q02, short __q01, short __q00) { + return __extension__(__m256i)(__v16hi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15}; +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_set_epi8(char __q31, char __q30, char 
__q29, char __q28, char __q27, + char __q26, char __q25, char __q24, char __q23, char __q22, + char __q21, char __q20, char __q19, char __q18, char __q17, + char __q16, char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, char __q07, + char __q06, char __q05, char __q04, char __q03, char __q02, + char __q01, char __q00) { + return __extension__(__m256i)(__v32qi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, + __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23, + __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31}; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_set_epi64x(long long __A, long long __B, long long __C, long long __D) { + return __extension__(__m256i)(__v4di){__D, __C, __B, __A}; +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_set1_pd(double __A) { + return __extension__(__m256d){__A, __A, __A, __A}; +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_set1_ps(float __A) { + return __extension__(__m256){__A, __A, __A, __A, __A, __A, __A, __A}; +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_set1_epi32(int __A) { + return __extension__(__m256i)(__v8si){__A, __A, __A, __A, __A, __A, __A, __A}; +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_set1_epi16(short __A) { + return _mm256_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_set1_epi8(char __A) { + return _mm256_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_set1_epi64x(long long __A) { + return __extension__(__m256i)(__v4di){__A, __A, __A, __A}; +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_setr_pd(double __A, double __B, double __C, double __D) { + return _mm256_set_pd(__D, __C, __B, __A); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_setr_ps(float __A, float __B, float __C, float __D, float __E, + float __F, float __G, float __H) { + return _mm256_set_ps(__H, __G, __F, __E, __D, __C, __B, __A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_setr_epi32(int __A, int __B, int __C, int __D, int __E, int __F, + int __G, int __H) { + return _mm256_set_epi32(__H, __G, __F, __E, __D, __C, __B, __A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_setr_epi16(short __q15, short __q14, short __q13, short __q12, + short __q11, short __q10, short __q09, short __q08, + short __q07, short __q06, short __q05, short __q04, + short __q03, short __q02, short __q01, short __q00) { + return _mm256_set_epi16(__q00, __q01, __q02, __q03, __q04, __q05, __q06, + __q07, __q08, __q09, __q10, __q11, __q12, __q13, + __q14, __q15); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm256_setr_epi8(char __q31, char __q30, char __q29, char __q28, char __q27, + char __q26, char __q25, char __q24, char __q23, char __q22, + char __q21, char __q20, char __q19, char __q18, char __q17, + char __q16, char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, char __q07, + char __q06, char __q05, char __q04, char __q03, char __q02, + char __q01, char __q00) { + return _mm256_set_epi8(__q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, + __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23, + __q24, __q25, __q26, __q27, __q28, __q29, __q30, + __q31); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm256_setr_epi64x(long long __A, long long __B, long long __C, long long __D) { + return _mm256_set_epi64x(__D, __C, __B, __A); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_castpd_ps(__m256d __A) { + return (__m256)__A; +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_castpd_si256(__m256d __A) { + return (__m256i)__A; +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_castps_pd(__m256 __A) { + return (__m256d)__A; +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_castps_si256(__m256 __A) { + return (__m256i)__A; +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_castsi256_ps(__m256i __A) { + return (__m256)__A; +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_castsi256_pd(__m256i __A) { + return (__m256d)__A; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_castpd256_pd128(__m256d __A) { + return (__m128d)__builtin_ia32_pd_pd256((__v4df)__A); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_castps256_ps128(__m256 __A) { + return (__m128)__builtin_ia32_ps_ps256((__v8sf)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_castsi256_si128(__m256i __A) { + return (__m128i)__builtin_ia32_si_si256((__v8si)__A); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_castpd128_pd256(__m128d __A) { + return (__m256d)__builtin_ia32_pd256_pd((__v2df)__A); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_castps128_ps256(__m128 __A) { + return (__m256)__builtin_ia32_ps256_ps((__v4sf)__A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_castsi128_si256(__m128i __A) { + return (__m256i)__builtin_ia32_si256_si((__v4si)__A); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_set_m128(__m128 __H, __m128 __L) { + return _mm256_insertf128_ps(_mm256_castps128_ps256(__L), __H, 1); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_set_m128d(__m128d __H, __m128d __L) { + return _mm256_insertf128_pd(_mm256_castpd128_pd256(__L), __H, 1); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
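+ /* Concatenates two 128-bit vectors: __H supplies bits 255:128 of the result, __L bits 127:0. */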
_mm256_set_m128i(__m128i __H, __m128i __L) { + return _mm256_insertf128_si256(_mm256_castsi128_si256(__L), __H, 1); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_setr_m128(__m128 __L, __m128 __H) { + return _mm256_set_m128(__H, __L); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_setr_m128d(__m128d __L, __m128d __H) { + return _mm256_set_m128d(__H, __L); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_setr_m128i(__m128i __L, __m128i __H) { + return _mm256_set_m128i(__H, __L); +} + +#ifdef __DISABLE_AVX__ +#undef __DISABLE_AVX__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX__ */ + +#endif /* _AVXINTRIN_H_INCLUDED */ diff --git a/third_party/intel/bmi2intrin.internal.h b/third_party/intel/bmi2intrin.internal.h new file mode 100644 index 000000000..d4c1e7499 --- /dev/null +++ b/third_party/intel/bmi2intrin.internal.h @@ -0,0 +1,78 @@ +#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED +#error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead." +#endif + +#ifndef _BMI2INTRIN_H_INCLUDED +#define _BMI2INTRIN_H_INCLUDED + +#ifndef __BMI2__ +#pragma GCC push_options +#pragma GCC target("bmi2") +#define __DISABLE_BMI2__ +#endif /* __BMI2__ */ + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bzhi_u32(unsigned int __X, unsigned int __Y) { + return __builtin_ia32_bzhi_si(__X, __Y); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pdep_u32(unsigned int __X, unsigned int __Y) { + return __builtin_ia32_pdep_si(__X, __Y); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pext_u32(unsigned int __X, unsigned int __Y) { + return __builtin_ia32_pext_si(__X, __Y); +} + +#ifdef __x86_64__ + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bzhi_u64(unsigned long long __X, unsigned long long __Y) { + return __builtin_ia32_bzhi_di(__X, __Y); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pdep_u64(unsigned long long __X, unsigned long long __Y) { + return __builtin_ia32_pdep_di(__X, __Y); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pext_u64(unsigned long long __X, unsigned long long __Y) { + return __builtin_ia32_pext_di(__X, __Y); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mulx_u64(unsigned long long __X, unsigned long long __Y, + unsigned long long *__P) { + unsigned __int128 __res = (unsigned __int128)__X * __Y; + *__P = (unsigned long long)(__res >> 64); + return (unsigned long long)__res; +} + +#else /* !__x86_64__ */ + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) { + unsigned long long __res = (unsigned long long)__X * __Y; + *__P = (unsigned int)(__res >> 32); + return (unsigned int)__res; +} + +#endif /* !__x86_64__ */ + +#ifdef __DISABLE_BMI2__ +#undef __DISABLE_BMI2__ +#pragma GCC pop_options +#endif /* __DISABLE_BMI2__ */ + +#endif /* _BMI2INTRIN_H_INCLUDED */ diff --git a/third_party/intel/bmiintrin.internal.h b/third_party/intel/bmiintrin.internal.h new file mode 100644 
index 000000000..9a4bab63b --- /dev/null +++ b/third_party/intel/bmiintrin.internal.h @@ -0,0 +1,160 @@ +#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED +#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead." +#endif + +#ifndef _BMIINTRIN_H_INCLUDED +#define _BMIINTRIN_H_INCLUDED + +#ifndef __BMI__ +#pragma GCC push_options +#pragma GCC target("bmi") +#define __DISABLE_BMI__ +#endif /* __BMI__ */ + +extern __inline unsigned short + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u16(unsigned short __X) { + return __builtin_ia32_tzcnt_u16(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __andn_u32(unsigned int __X, unsigned int __Y) { + return ~__X & __Y; +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bextr_u32(unsigned int __X, unsigned int __Y) { + return __builtin_ia32_bextr_u32(__X, __Y); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bextr_u32(unsigned int __X, unsigned int __Y, unsigned __Z) { + return __builtin_ia32_bextr_u32(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsi_u32(unsigned int __X) { + return __X & -__X; +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsi_u32(unsigned int __X) { + return __blsi_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsmsk_u32(unsigned int __X) { + return __X ^ (__X - 1); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsmsk_u32(unsigned int __X) { + return __blsmsk_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsr_u32(unsigned int __X) { + return __X & (__X - 1); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsr_u32(unsigned int __X) { + return __blsr_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u32(unsigned int __X) { + return __builtin_ia32_tzcnt_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _tzcnt_u32(unsigned int __X) { + return __builtin_ia32_tzcnt_u32(__X); +} + +#ifdef __x86_64__ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __andn_u64(unsigned long long __X, unsigned long long __Y) { + return ~__X & __Y; +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bextr_u64(unsigned long long __X, unsigned long long __Y) { + return __builtin_ia32_bextr_u64(__X, __Y); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) { + return __builtin_ia32_bextr_u64(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsi_u64(unsigned long long __X) { + return __X & -__X; +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsi_u64(unsigned long long 
__X) { + return __blsi_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsmsk_u64(unsigned long long __X) { + return __X ^ (__X - 1); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsmsk_u64(unsigned long long __X) { + return __blsmsk_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsr_u64(unsigned long long __X) { + return __X & (__X - 1); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsr_u64(unsigned long long __X) { + return __blsr_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u64(unsigned long long __X) { + return __builtin_ia32_tzcnt_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _tzcnt_u64(unsigned long long __X) { + return __builtin_ia32_tzcnt_u64(__X); +} + +#endif /* __x86_64__ */ + +#ifdef __DISABLE_BMI__ +#undef __DISABLE_BMI__ +#pragma GCC pop_options +#endif /* __DISABLE_BMI__ */ + +#endif /* _BMIINTRIN_H_INCLUDED */ diff --git a/third_party/intel/cetintrin.internal.h b/third_party/intel/cetintrin.internal.h new file mode 100644 index 000000000..fa31a21b5 --- /dev/null +++ b/third_party/intel/cetintrin.internal.h @@ -0,0 +1,95 @@ +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <cetintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _CETINTRIN_H_INCLUDED +#define _CETINTRIN_H_INCLUDED + +#ifndef __SHSTK__ +#pragma GCC push_options +#pragma GCC target("shstk") +#define __DISABLE_SHSTK__ +#endif /* __SHSTK__ */ + +#ifdef __x86_64__ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _get_ssp(void) { + return __builtin_ia32_rdsspq(); +} +#else +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _get_ssp(void) { + return __builtin_ia32_rdsspd(); +} +#endif + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _inc_ssp(unsigned int __B) { +#ifdef __x86_64__ + __builtin_ia32_incsspq((unsigned long long)__B); +#else + __builtin_ia32_incsspd(__B); +#endif +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _saveprevssp(void) { + __builtin_ia32_saveprevssp(); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _rstorssp(void *__B) { + __builtin_ia32_rstorssp(__B); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _wrssd(unsigned int __B, void *__C) { + __builtin_ia32_wrssd(__B, __C); +} + +#ifdef __x86_64__ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _wrssq(unsigned long long __B, void *__C) { + __builtin_ia32_wrssq(__B, __C); +} +#endif + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _wrussd(unsigned int __B, void *__C) { + __builtin_ia32_wrussd(__B, __C); +} + +#ifdef __x86_64__ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _wrussq(unsigned long long __B, void *__C) { + __builtin_ia32_wrussq(__B, __C); +} +#endif + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _setssbsy(void) { + 
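+ /* SETSSBSY: sets the busy flag in the supervisor shadow-stack token addressed by the IA32_PL0_SSP MSR. */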
__builtin_ia32_setssbsy(); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _clrssbsy(void *__B) { + __builtin_ia32_clrssbsy(__B); +} + +#ifdef __DISABLE_SHSTK__ +#undef __DISABLE_SHSTK__ +#pragma GCC pop_options +#endif /* __DISABLE_SHSTK__ */ + +#endif /* _CETINTRIN_H_INCLUDED. */ diff --git a/third_party/intel/cldemoteintrin.internal.h b/third_party/intel/cldemoteintrin.internal.h new file mode 100644 index 000000000..7a053d6fa --- /dev/null +++ b/third_party/intel/cldemoteintrin.internal.h @@ -0,0 +1,23 @@ +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <cldemoteintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _CLDEMOTE_H_INCLUDED +#define _CLDEMOTE_H_INCLUDED + +#ifndef __CLDEMOTE__ +#pragma GCC push_options +#pragma GCC target("cldemote") +#define __DISABLE_CLDEMOTE__ +#endif /* __CLDEMOTE__ */ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _cldemote(void *__A) { + __builtin_ia32_cldemote(__A); +} +#ifdef __DISABLE_CLDEMOTE__ +#undef __DISABLE_CLDEMOTE__ +#pragma GCC pop_options +#endif /* __DISABLE_CLDEMOTE__ */ + +#endif /* _CLDEMOTE_H_INCLUDED */ diff --git a/third_party/intel/clflushoptintrin.internal.h b/third_party/intel/clflushoptintrin.internal.h new file mode 100644 index 000000000..da1d119eb --- /dev/null +++ b/third_party/intel/clflushoptintrin.internal.h @@ -0,0 +1,25 @@ +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _CLFLUSHOPTINTRIN_H_INCLUDED +#define _CLFLUSHOPTINTRIN_H_INCLUDED + +#ifndef __CLFLUSHOPT__ +#pragma GCC push_options +#pragma GCC target("clflushopt") +#define __DISABLE_CLFLUSHOPT__ +#endif /* __CLFLUSHOPT__ */ + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_clflushopt(void *__A) { + __builtin_ia32_clflushopt(__A); +} + +#ifdef __DISABLE_CLFLUSHOPT__ +#undef __DISABLE_CLFLUSHOPT__ +#pragma GCC pop_options +#endif /* __DISABLE_CLFLUSHOPT__ */ + +#endif /* _CLFLUSHOPTINTRIN_H_INCLUDED */ diff --git a/third_party/intel/clwbintrin.internal.h b/third_party/intel/clwbintrin.internal.h new file mode 100644 index 000000000..3180c94db --- /dev/null +++ b/third_party/intel/clwbintrin.internal.h @@ -0,0 +1,25 @@ +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <clwbintrin.h> directly; include <immintrin.h> instead." 
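+/* Editorial note: CLWB writes a dirty cache line back to memory but, unlike CLFLUSHOPT above, may leave the line cached, making it the gentler flush for persistent-memory stores. */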
+#endif + +#ifndef _CLWBINTRIN_H_INCLUDED +#define _CLWBINTRIN_H_INCLUDED + +#ifndef __CLWB__ +#pragma GCC push_options +#pragma GCC target("clwb") +#define __DISABLE_CLWB__ +#endif /* __CLWB__ */ + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_clwb(void *__A) { + __builtin_ia32_clwb(__A); +} + +#ifdef __DISABLE_CLWB__ +#undef __DISABLE_CLWB__ +#pragma GCC pop_options +#endif /* __DISABLE_CLWB__ */ + +#endif /* _CLWBINTRIN_H_INCLUDED */ diff --git a/third_party/intel/clzerointrin.internal.h b/third_party/intel/clzerointrin.internal.h new file mode 100644 index 000000000..b9841d7cc --- /dev/null +++ b/third_party/intel/clzerointrin.internal.h @@ -0,0 +1,21 @@ +#ifndef _CLZEROINTRIN_H_INCLUDED +#define _CLZEROINTRIN_H_INCLUDED + +#ifndef __CLZERO__ +#pragma GCC push_options +#pragma GCC target("clzero") +#define __DISABLE_CLZERO__ +#endif /* __CLZERO__ */ + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_clzero(void* __I) { + __builtin_ia32_clzero(__I); +} + +#ifdef __DISABLE_CLZERO__ +#undef __DISABLE_CLZERO__ +#pragma GCC pop_options +#endif /* __DISABLE_CLZERO__ */ + +#endif /* _CLZEROINTRIN_H_INCLUDED */ diff --git a/third_party/intel/cpuid.internal.h b/third_party/intel/cpuid.internal.h new file mode 100644 index 000000000..c2918deed --- /dev/null +++ b/third_party/intel/cpuid.internal.h @@ -0,0 +1,235 @@ +#ifndef COSMOPOLITAN_THIRD_PARTY_INTEL_CPUID_INTERNAL_H_ +#define COSMOPOLITAN_THIRD_PARTY_INTEL_CPUID_INTERNAL_H_ +#if !(__ASSEMBLER__ + __LINKER__ + 0) + +#define bit_SSE3 (1 << 0) +#define bit_PCLMUL (1 << 1) +#define bit_LZCNT (1 << 5) +#define bit_SSSE3 (1 << 9) +#define bit_FMA (1 << 12) +#define bit_CMPXCHG16B (1 << 13) +#define bit_SSE4_1 (1 << 19) +#define bit_SSE4_2 (1 << 20) +#define bit_MOVBE (1 << 22) +#define bit_POPCNT (1 << 23) +#define bit_AES (1 << 25) +#define bit_XSAVE (1 << 26) +#define bit_OSXSAVE (1 << 27) +#define bit_AVX (1 << 28) +#define bit_F16C (1 << 29) +#define bit_RDRND (1 << 30) + +#define bit_CMPXCHG8B (1 << 8) +#define bit_CMOV (1 << 15) +#define bit_MMX (1 << 23) +#define bit_FXSAVE (1 << 24) +#define bit_SSE (1 << 25) +#define bit_SSE2 (1 << 26) + +#define bit_LAHF_LM (1 << 0) +#define bit_ABM (1 << 5) +#define bit_SSE4a (1 << 6) +#define bit_PRFCHW (1 << 8) +#define bit_XOP (1 << 11) +#define bit_LWP (1 << 15) +#define bit_FMA4 (1 << 16) +#define bit_TBM (1 << 21) +#define bit_MWAITX (1 << 29) + +#define bit_MMXEXT (1 << 22) +#define bit_LM (1 << 29) +#define bit_3DNOWP (1 << 30) +#define bit_3DNOW (1u << 31) + +#define bit_CLZERO (1 << 0) +#define bit_WBNOINVD (1 << 9) + +#define bit_FSGSBASE (1 << 0) +#define bit_SGX (1 << 2) +#define bit_BMI (1 << 3) +#define bit_HLE (1 << 4) +#define bit_AVX2 (1 << 5) +#define bit_BMI2 (1 << 8) +#define bit_RTM (1 << 11) +#define bit_MPX (1 << 14) +#define bit_AVX512F (1 << 16) +#define bit_AVX512DQ (1 << 17) +#define bit_RDSEED (1 << 18) +#define bit_ADX (1 << 19) +#define bit_AVX512IFMA (1 << 21) +#define bit_CLFLUSHOPT (1 << 23) +#define bit_CLWB (1 << 24) +#define bit_AVX512PF (1 << 26) +#define bit_AVX512ER (1 << 27) +#define bit_AVX512CD (1 << 28) +#define bit_SHA (1 << 29) +#define bit_AVX512BW (1 << 30) +#define bit_AVX512VL (1u << 31) + +#define bit_PREFETCHWT1 (1 << 0) +#define bit_AVX512VBMI (1 << 1) +#define bit_PKU (1 << 3) +#define bit_OSPKE (1 << 4) +#define bit_WAITPKG (1 << 5) +#define bit_AVX512VBMI2 (1 << 6) +#define bit_SHSTK (1 << 7) +#define bit_GFNI (1 << 8) +#define bit_VAES (1 << 9) 
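+/* The feature bits in this group come from CPUID leaf 7, subleaf 0
+   (ECX). A minimal runtime probe, sketched here as an editorial usage
+   note with the __get_cpuid_count() helper defined at the bottom of
+   this header (not part of the upstream GCC header):
+
+     unsigned int eax, ebx, ecx, edx;
+     if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) &&
+         (ecx & bit_VAES)) {
+       // VAES instructions are available
+     }
+*/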
+#define bit_AVX512VNNI (1 << 11) +#define bit_VPCLMULQDQ (1 << 10) +#define bit_AVX512BITALG (1 << 12) +#define bit_AVX512VPOPCNTDQ (1 << 14) +#define bit_RDPID (1 << 22) +#define bit_MOVDIRI (1 << 27) +#define bit_MOVDIR64B (1 << 28) +#define bit_CLDEMOTE (1 << 25) + +#define bit_AVX5124VNNIW (1 << 2) +#define bit_AVX5124FMAPS (1 << 3) +#define bit_IBT (1 << 20) +#define bit_PCONFIG (1 << 18) + +#define bit_BNDREGS (1 << 3) +#define bit_BNDCSR (1 << 4) + +#define bit_XSAVEOPT (1 << 0) +#define bit_XSAVEC (1 << 1) +#define bit_XSAVES (1 << 3) + +#define bit_PTWRITE (1 << 4) + +#define signature_AMD_ebx 0x68747541 +#define signature_AMD_ecx 0x444d4163 +#define signature_AMD_edx 0x69746e65 + +#define signature_CENTAUR_ebx 0x746e6543 +#define signature_CENTAUR_ecx 0x736c7561 +#define signature_CENTAUR_edx 0x48727561 + +#define signature_CYRIX_ebx 0x69727943 +#define signature_CYRIX_ecx 0x64616574 +#define signature_CYRIX_edx 0x736e4978 + +#define signature_INTEL_ebx 0x756e6547 +#define signature_INTEL_ecx 0x6c65746e +#define signature_INTEL_edx 0x49656e69 + +#define signature_TM1_ebx 0x6e617254 +#define signature_TM1_ecx 0x55504361 +#define signature_TM1_edx 0x74656d73 + +#define signature_TM2_ebx 0x756e6547 +#define signature_TM2_ecx 0x3638784d +#define signature_TM2_edx 0x54656e69 + +#define signature_NSC_ebx 0x646f6547 +#define signature_NSC_ecx 0x43534e20 +#define signature_NSC_edx 0x79622065 + +#define signature_NEXGEN_ebx 0x4778654e +#define signature_NEXGEN_ecx 0x6e657669 +#define signature_NEXGEN_edx 0x72446e65 + +#define signature_RISE_ebx 0x65736952 +#define signature_RISE_ecx 0x65736952 +#define signature_RISE_edx 0x65736952 + +#define signature_SIS_ebx 0x20536953 +#define signature_SIS_ecx 0x20536953 +#define signature_SIS_edx 0x20536953 + +#define signature_UMC_ebx 0x20434d55 +#define signature_UMC_ecx 0x20434d55 +#define signature_UMC_edx 0x20434d55 + +#define signature_VIA_ebx 0x20414956 +#define signature_VIA_ecx 0x20414956 +#define signature_VIA_edx 0x20414956 + +#define signature_VORTEX_ebx 0x74726f56 +#define signature_VORTEX_ecx 0x436f5320 +#define signature_VORTEX_edx 0x36387865 + +#ifndef __x86_64__ + +#define __cpuid(level, a, b, c, d) \ + do { \ + if (__builtin_constant_p(level) && (level) != 1) \ + __asm__("cpuid\n\t" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(level)); \ + else \ + __asm__("cpuid\n\t" \ + : "=a"(a), "=b"(b), "=c"(c), "=d"(d) \ + : "0"(level), "1"(0), "2"(0)); \ + } while (0) +#else +#define __cpuid(level, a, b, c, d) \ + __asm__("cpuid\n\t" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(level)) +#endif + +#define __cpuid_count(level, count, a, b, c, d) \ + __asm__("cpuid\n\t" \ + : "=a"(a), "=b"(b), "=c"(c), "=d"(d) \ + : "0"(level), "2"(count)) + +static __inline unsigned int __get_cpuid_max(unsigned int __ext, + unsigned int *__sig) { + unsigned int __eax, __ebx, __ecx, __edx; +#ifndef __x86_64__ +#if __GNUC__ >= 3 + __asm__("pushf{l|d}\n\t" + "pushf{l|d}\n\t" + "pop{l}\t%0\n\t" + "mov{l}\t{%0, %1|%1, %0}\n\t" + "xor{l}\t{%2, %0|%0, %2}\n\t" + "push{l}\t%0\n\t" + "popf{l|d}\n\t" + "pushf{l|d}\n\t" + "pop{l}\t%0\n\t" + "popf{l|d}\n\t" + : "=&r"(__eax), "=&r"(__ebx) + : "i"(0x00200000)); +#else + __asm__("pushfl\n\t" + "pushfl\n\t" + "popl\t%0\n\t" + "movl\t%0, %1\n\t" + "xorl\t%2, %0\n\t" + "pushl\t%0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl\t%0\n\t" + "popfl\n\t" + : "=&r"(__eax), "=&r"(__ebx) + : "i"(0x00200000)); +#endif + if (!((__eax ^ __ebx) & 0x00200000)) return 0; +#endif + __cpuid(__ext, __eax, __ebx, __ecx, __edx); + if (__sig) *__sig = 
__ebx; + return __eax; +} + +static __inline int __get_cpuid(unsigned int __leaf, unsigned int *__eax, + unsigned int *__ebx, unsigned int *__ecx, + unsigned int *__edx) { + unsigned int __ext = __leaf & 0x80000000; + unsigned int __maxlevel = __get_cpuid_max(__ext, 0); + if (__maxlevel == 0 || __maxlevel < __leaf) return 0; + __cpuid(__leaf, *__eax, *__ebx, *__ecx, *__edx); + return 1; +} + +static __inline int __get_cpuid_count(unsigned int __leaf, + unsigned int __subleaf, + unsigned int *__eax, unsigned int *__ebx, + unsigned int *__ecx, + unsigned int *__edx) { + unsigned int __ext = __leaf & 0x80000000; + unsigned int __maxlevel = __get_cpuid_max(__ext, 0); + if (__maxlevel == 0 || __maxlevel < __leaf) return 0; + __cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx); + return 1; +} + +#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ +#endif /* COSMOPOLITAN_THIRD_PARTY_INTEL_CPUID_INTERNAL_H_ */ diff --git a/third_party/intel/emmintrin.internal.h b/third_party/intel/emmintrin.internal.h new file mode 100644 index 000000000..f920905d5 --- /dev/null +++ b/third_party/intel/emmintrin.internal.h @@ -0,0 +1,1497 @@ +#ifndef _EMMINTRIN_H_INCLUDED +#define _EMMINTRIN_H_INCLUDED +#include "third_party/intel/xmmintrin.internal.h" + +#ifndef __SSE2__ +#pragma GCC push_options +#pragma GCC target("sse2") +#define __DISABLE_SSE2__ +#endif /* __SSE2__ */ + +typedef double __v2df __attribute__((__vector_size__(16))); +typedef long long __v2di __attribute__((__vector_size__(16))); +typedef unsigned long long __v2du __attribute__((__vector_size__(16))); +typedef int __v4si __attribute__((__vector_size__(16))); +typedef unsigned int __v4su __attribute__((__vector_size__(16))); +typedef short __v8hi __attribute__((__vector_size__(16))); +typedef unsigned short __v8hu __attribute__((__vector_size__(16))); +typedef char __v16qi __attribute__((__vector_size__(16))); +typedef signed char __v16qs __attribute__((__vector_size__(16))); +typedef unsigned char __v16qu __attribute__((__vector_size__(16))); + +typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__)); +typedef double __m128d __attribute__((__vector_size__(16), __may_alias__)); + +typedef long long __m128i_u + __attribute__((__vector_size__(16), __may_alias__, __aligned__(1))); +typedef double __m128d_u + __attribute__((__vector_size__(16), __may_alias__, __aligned__(1))); + +#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0)) + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_sd(double __F) { + return __extension__(__m128d){__F, 0.0}; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_pd(double __F) { + return __extension__(__m128d){__F, __F}; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_pd1(double __F) { + return _mm_set1_pd(__F); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_pd(double __W, double __X) { + return __extension__(__m128d){__X, __W}; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_pd(double __W, double __X) { + return __extension__(__m128d){__W, __X}; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_undefined_pd(void) { + __m128d __Y = __Y; + return __Y; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm_setzero_pd(void) { + return __extension__(__m128d){0.0, 0.0}; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_move_sd(__m128d __A, __m128d __B) { + return __extension__(__m128d) + __builtin_shuffle((__v2df)__A, (__v2df)__B, (__v2di){2, 1}); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_pd(double const *__P) { + return *(__m128d *)__P; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadu_pd(double const *__P) { + return *(__m128d_u *)__P; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load1_pd(double const *__P) { + return _mm_set1_pd(*__P); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_sd(double const *__P) { + return _mm_set_sd(*__P); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_pd1(double const *__P) { + return _mm_load1_pd(__P); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadr_pd(double const *__P) { + __m128d __tmp = _mm_load_pd(__P); + return __builtin_ia32_shufpd(__tmp, __tmp, _MM_SHUFFLE2(0, 1)); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_pd(double *__P, __m128d __A) { + *(__m128d *)__P = __A; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storeu_pd(double *__P, __m128d __A) { + *(__m128d_u *)__P = __A; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_sd(double *__P, __m128d __A) { + *__P = ((__v2df)__A)[0]; +} + +extern __inline double + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsd_f64(__m128d __A) { + return ((__v2df)__A)[0]; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storel_pd(double *__P, __m128d __A) { + _mm_store_sd(__P, __A); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storeh_pd(double *__P, __m128d __A) { + *__P = ((__v2df)__A)[1]; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store1_pd(double *__P, __m128d __A) { + _mm_store_pd(__P, __builtin_ia32_shufpd(__A, __A, _MM_SHUFFLE2(0, 0))); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_pd1(double *__P, __m128d __A) { + _mm_store1_pd(__P, __A); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storer_pd(double *__P, __m128d __A) { + _mm_store_pd(__P, __builtin_ia32_shufpd(__A, __A, _MM_SHUFFLE2(0, 1))); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi128_si32(__m128i __A) { + return __builtin_ia32_vec_ext_v4si((__v4si)__A, 0); +} + +#ifdef __x86_64__ + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi128_si64(__m128i __A) { + return ((__v2di)__A)[0]; +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi128_si64x(__m128i __A) { + return ((__v2di)__A)[0]; +} +#endif + +extern __inline __m128d + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm_add_pd(__m128d __A, __m128d __B) { + return (__m128d)((__v2df)__A + (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_addsd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_pd(__m128d __A, __m128d __B) { + return (__m128d)((__v2df)__A - (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_subsd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_pd(__m128d __A, __m128d __B) { + return (__m128d)((__v2df)__A * (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_mulsd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_div_pd(__m128d __A, __m128d __B) { + return (__m128d)((__v2df)__A / (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_div_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_divsd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sqrt_pd(__m128d __A) { + return (__m128d)__builtin_ia32_sqrtpd((__v2df)__A); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sqrt_sd(__m128d __A, __m128d __B) { + __v2df __tmp = __builtin_ia32_movsd((__v2df)__A, (__v2df)__B); + return (__m128d)__builtin_ia32_sqrtsd((__v2df)__tmp); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_minpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_minsd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_maxpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_maxsd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_and_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_andpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_andnot_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_andnpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_or_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_orpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_xor_pd(__m128d __A, __m128d __B) { + return 
(__m128d)__builtin_ia32_xorpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpltpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmple_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmplepd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpgtpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpge_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpgepd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpneq_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnlt_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnle_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpngt_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpngtpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnge_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpngepd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpord_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpordpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpunord_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpltsd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmple_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmplesd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_movsd( + (__v2df)__A, (__v2df)__builtin_ia32_cmpltsd((__v2df)__B, (__v2df)__A)); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm_cmpge_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_movsd( + (__v2df)__A, (__v2df)__builtin_ia32_cmplesd((__v2df)__B, (__v2df)__A)); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpneq_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnlt_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnle_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpngt_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_movsd( + (__v2df)__A, (__v2df)__builtin_ia32_cmpnltsd((__v2df)__B, (__v2df)__A)); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnge_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_movsd( + (__v2df)__A, (__v2df)__builtin_ia32_cmpnlesd((__v2df)__B, (__v2df)__A)); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpord_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpordsd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpunord_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__A, (__v2df)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comieq_sd(__m128d __A, __m128d __B) { + return __builtin_ia32_comisdeq((__v2df)__A, (__v2df)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comilt_sd(__m128d __A, __m128d __B) { + return __builtin_ia32_comisdlt((__v2df)__A, (__v2df)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comile_sd(__m128d __A, __m128d __B) { + return __builtin_ia32_comisdle((__v2df)__A, (__v2df)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comigt_sd(__m128d __A, __m128d __B) { + return __builtin_ia32_comisdgt((__v2df)__A, (__v2df)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comige_sd(__m128d __A, __m128d __B) { + return __builtin_ia32_comisdge((__v2df)__A, (__v2df)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comineq_sd(__m128d __A, __m128d __B) { + return __builtin_ia32_comisdneq((__v2df)__A, (__v2df)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomieq_sd(__m128d __A, __m128d __B) { + return __builtin_ia32_ucomisdeq((__v2df)__A, (__v2df)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomilt_sd(__m128d __A, __m128d __B) { + return __builtin_ia32_ucomisdlt((__v2df)__A, (__v2df)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomile_sd(__m128d __A, __m128d __B) { + return 
__builtin_ia32_ucomisdle((__v2df)__A, (__v2df)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomigt_sd(__m128d __A, __m128d __B) { + return __builtin_ia32_ucomisdgt((__v2df)__A, (__v2df)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomige_sd(__m128d __A, __m128d __B) { + return __builtin_ia32_ucomisdge((__v2df)__A, (__v2df)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomineq_sd(__m128d __A, __m128d __B) { + return __builtin_ia32_ucomisdneq((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_epi64x(long long __q1, long long __q0) { + return __extension__(__m128i)(__v2di){__q0, __q1}; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_epi64(__m64 __q1, __m64 __q0) { + return _mm_set_epi64x((long long)__q1, (long long)__q0); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) { + return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3}; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3, + short __q2, short __q1, short __q0) { + return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3, + __q4, __q5, __q6, __q7}; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11, + char __q10, char __q09, char __q08, char __q07, char __q06, + char __q05, char __q04, char __q03, char __q02, char __q01, + char __q00) { + return __extension__(__m128i)(__v16qi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15}; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_epi64x(long long __A) { + return _mm_set_epi64x(__A, __A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_epi64(__m64 __A) { + return _mm_set_epi64(__A, __A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_epi32(int __A) { + return _mm_set_epi32(__A, __A, __A, __A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_epi16(short __A) { + return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_epi8(char __A) { + return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_epi64(__m64 __q0, __m64 __q1) { + return _mm_set_epi64(__q1, __q0); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) { + return _mm_set_epi32(__q3, __q2, __q1, __q0); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4, + 
short __q5, short __q6, short __q7) { + return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04, + char __q05, char __q06, char __q07, char __q08, char __q09, + char __q10, char __q11, char __q12, char __q13, char __q14, + char __q15) { + return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, + __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_si128(__m128i const *__P) { + return *__P; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadu_si128(__m128i_u const *__P) { + return *__P; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadl_epi64(__m128i_u const *__P) { + return _mm_set_epi64((__m64)0LL, *(__m64_u *)__P); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadu_si64(void const *__P) { + return _mm_loadl_epi64((__m128i_u *)__P); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_si128(__m128i *__P, __m128i __B) { + *__P = __B; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storeu_si128(__m128i_u *__P, __m128i __B) { + *__P = __B; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storel_epi64(__m128i_u *__P, __m128i __B) { + *(__m64_u *)__P = (__m64)((__v2di)__B)[0]; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storeu_si64(void *__P, __m128i __B) { + _mm_storel_epi64((__m128i_u *)__P, __B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movepi64_pi64(__m128i __B) { + return (__m64)((__v2di)__B)[0]; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movpi64_epi64(__m64 __A) { + return _mm_set_epi64((__m64)0LL, __A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_move_epi64(__m128i __A) { + return (__m128i)__builtin_ia32_movq128((__v2di)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_undefined_si128(void) { + __m128i __Y = __Y; + return __Y; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setzero_si128(void) { + return __extension__(__m128i)(__v4si){0, 0, 0, 0}; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi32_pd(__m128i __A) { + return (__m128d)__builtin_ia32_cvtdq2pd((__v4si)__A); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi32_ps(__m128i __A) { + return (__m128)__builtin_ia32_cvtdq2ps((__v4si)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpd_epi32(__m128d __A) { + return (__m128i)__builtin_ia32_cvtpd2dq((__v2df)__A); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpd_pi32(__m128d __A) { + return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__A); +} + 
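+/* Usage note (editorial, not part of the upstream header): the _mm_cvt*
+   conversions round according to the current MXCSR rounding mode
+   (round-to-nearest-even by default), while the _mm_cvtt* variants
+   below truncate toward zero:
+
+     __m128d d = _mm_set_pd(-2.5, 1.5);  // lanes {1.5, -2.5}
+     _mm_cvtpd_epi32(d);                 // {2, -2, 0, 0} with default rounding
+     _mm_cvttpd_epi32(d);                // {1, -2, 0, 0} truncated
+*/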
+extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpd_ps(__m128d __A) { + return (__m128)__builtin_ia32_cvtpd2ps((__v2df)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttpd_epi32(__m128d __A) { + return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__A); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttpd_pi32(__m128d __A) { + return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__A); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpi32_pd(__m64 __A) { + return (__m128d)__builtin_ia32_cvtpi2pd((__v2si)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtps_epi32(__m128 __A) { + return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttps_epi32(__m128 __A) { + return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__A); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtps_pd(__m128 __A) { + return (__m128d)__builtin_ia32_cvtps2pd((__v4sf)__A); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsd_si32(__m128d __A) { + return __builtin_ia32_cvtsd2si((__v2df)__A); +} + +#ifdef __x86_64__ + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsd_si64(__m128d __A) { + return __builtin_ia32_cvtsd2si64((__v2df)__A); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsd_si64x(__m128d __A) { + return __builtin_ia32_cvtsd2si64((__v2df)__A); +} +#endif + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttsd_si32(__m128d __A) { + return __builtin_ia32_cvttsd2si((__v2df)__A); +} + +#ifdef __x86_64__ + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttsd_si64(__m128d __A) { + return __builtin_ia32_cvttsd2si64((__v2df)__A); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttsd_si64x(__m128d __A) { + return __builtin_ia32_cvttsd2si64((__v2df)__A); +} +#endif + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsd_ss(__m128 __A, __m128d __B) { + return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi32_sd(__m128d __A, int __B) { + return (__m128d)__builtin_ia32_cvtsi2sd((__v2df)__A, __B); +} + +#ifdef __x86_64__ + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64_sd(__m128d __A, long long __B) { + return (__m128d)__builtin_ia32_cvtsi642sd((__v2df)__A, __B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64x_sd(__m128d __A, long long __B) { + return (__m128d)__builtin_ia32_cvtsi642sd((__v2df)__A, __B); +} +#endif + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_sd(__m128d __A, __m128 __B) { + return (__m128d)__builtin_ia32_cvtss2sd((__v2df)__A, (__v4sf)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline 
__m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) { + return (__m128d)__builtin_ia32_shufpd((__v2df)__A, (__v2df)__B, __mask); +} +#else +#define _mm_shuffle_pd(A, B, N) \ + ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \ + (int)(N))) +#endif + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_unpckhpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_pd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_unpcklpd((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadh_pd(__m128d __A, double const *__B) { + return (__m128d)__builtin_ia32_loadhpd((__v2df)__A, __B); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadl_pd(__m128d __A, double const *__B) { + return (__m128d)__builtin_ia32_loadlpd((__v2df)__A, __B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movemask_pd(__m128d __A) { + return __builtin_ia32_movmskpd((__v2df)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packs_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_packsswb128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packs_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_packssdw128((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packus_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_packuswb128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_punpckhbw128((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_punpckhwd128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_punpckhqdq128((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_punpcklbw128((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_punpcklwd128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_epi32(__m128i __A, __m128i __B) { + return 
(__m128i)__builtin_ia32_punpckldq128((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_epi8(__m128i __A, __m128i __B) { + return (__m128i)((__v16qu)__A + (__v16qu)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_epi16(__m128i __A, __m128i __B) { + return (__m128i)((__v8hu)__A + (__v8hu)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_epi32(__m128i __A, __m128i __B) { + return (__m128i)((__v4su)__A + (__v4su)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_epi64(__m128i __A, __m128i __B) { + return (__m128i)((__v2du)__A + (__v2du)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddsb128((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddsw128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_epu8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddusb128((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_epu16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_paddusw128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_epi8(__m128i __A, __m128i __B) { + return (__m128i)((__v16qu)__A - (__v16qu)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_epi16(__m128i __A, __m128i __B) { + return (__m128i)((__v8hu)__A - (__v8hu)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_epi32(__m128i __A, __m128i __B) { + return (__m128i)((__v4su)__A - (__v4su)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_epi64(__m128i __A, __m128i __B) { + return (__m128i)((__v2du)__A - (__v2du)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubsb128((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubsw128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_epu8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubusb128((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_epu16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psubusw128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline 
__m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_madd_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mulhi_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mullo_epi16(__m128i __A, __m128i __B) { + return (__m128i)((__v8hu)__A * (__v8hu)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_su32(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pmuludq((__v2si)__A, (__v2si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_epu32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmuludq128((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_slli_epi16(__m128i __A, int __B) { + return (__m128i)__builtin_ia32_psllwi128((__v8hi)__A, __B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_slli_epi32(__m128i __A, int __B) { + return (__m128i)__builtin_ia32_pslldi128((__v4si)__A, __B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_slli_epi64(__m128i __A, int __B) { + return (__m128i)__builtin_ia32_psllqi128((__v2di)__A, __B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srai_epi16(__m128i __A, int __B) { + return (__m128i)__builtin_ia32_psrawi128((__v8hi)__A, __B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srai_epi32(__m128i __A, int __B) { + return (__m128i)__builtin_ia32_psradi128((__v4si)__A, __B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_bsrli_si128(__m128i __A, const int __N) { + return (__m128i)__builtin_ia32_psrldqi128(__A, __N * 8); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_bslli_si128(__m128i __A, const int __N) { + return (__m128i)__builtin_ia32_pslldqi128(__A, __N * 8); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srli_si128(__m128i __A, const int __N) { + return (__m128i)__builtin_ia32_psrldqi128(__A, __N * 8); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_slli_si128(__m128i __A, const int __N) { + return (__m128i)__builtin_ia32_pslldqi128(__A, __N * 8); +} +#else +#define _mm_bsrli_si128(A, N) \ + ((__m128i)__builtin_ia32_psrldqi128((__m128i)(A), (int)(N)*8)) +#define _mm_bslli_si128(A, N) \ + ((__m128i)__builtin_ia32_pslldqi128((__m128i)(A), (int)(N)*8)) +#define _mm_srli_si128(A, N) \ + ((__m128i)__builtin_ia32_psrldqi128((__m128i)(A), (int)(N)*8)) +#define _mm_slli_si128(A, N) \ + ((__m128i)__builtin_ia32_pslldqi128((__m128i)(A), (int)(N)*8)) +#endif + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srli_epi16(__m128i __A, int __B) { + return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__A, __B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm_srli_epi32(__m128i __A, int __B) { + return (__m128i)__builtin_ia32_psrldi128((__v4si)__A, __B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srli_epi64(__m128i __A, int __B) { + return (__m128i)__builtin_ia32_psrlqi128((__v2di)__A, __B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sll_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sll_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sll_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sra_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psraw128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sra_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psrad128((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srl_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psrlw128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srl_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psrld128((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srl_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psrlq128((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_and_si128(__m128i __A, __m128i __B) { + return (__m128i)((__v2du)__A & (__v2du)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_andnot_si128(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pandn128((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_or_si128(__m128i __A, __m128i __B) { + return (__m128i)((__v2du)__A | (__v2du)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_xor_si128(__m128i __A, __m128i __B) { + return (__m128i)((__v2du)__A ^ (__v2du)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_epi8(__m128i __A, __m128i __B) { + return (__m128i)((__v16qs)__A == (__v16qs)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_epi16(__m128i __A, __m128i __B) { + return (__m128i)((__v8hi)__A == (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_epi32(__m128i __A, __m128i __B) { + return (__m128i)((__v4si)__A == (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_epi8(__m128i __A, __m128i __B) { + return 
(__m128i)((__v16qs)__A < (__v16qs)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_epi16(__m128i __A, __m128i __B) { + return (__m128i)((__v8hi)__A < (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_epi32(__m128i __A, __m128i __B) { + return (__m128i)((__v4si)__A < (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_epi8(__m128i __A, __m128i __B) { + return (__m128i)((__v16qs)__A > (__v16qs)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_epi16(__m128i __A, __m128i __B) { + return (__m128i)((__v8hi)__A > (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_epi32(__m128i __A, __m128i __B) { + return (__m128i)((__v4si)__A > (__v4si)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_epi16(__m128i const __A, int const __N) { + return (unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)__A, __N); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_epi16(__m128i const __A, int const __D, int const __N) { + return (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)__A, __D, __N); +} +#else +#define _mm_extract_epi16(A, N) \ + ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(A), \ + (int)(N))) +#define _mm_insert_epi16(A, D, N) \ + ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(A), (int)(D), \ + (int)(N))) +#endif + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_epu8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pminsw128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_epu8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pminub128((__v16qi)__A, (__v16qi)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movemask_epi8(__m128i __A) { + return __builtin_ia32_pmovmskb128((__v16qi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mulhi_epu16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__A, (__v8hi)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shufflehi_epi16(__m128i __A, const int __mask) { + return (__m128i)__builtin_ia32_pshufhw((__v8hi)__A, __mask); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shufflelo_epi16(__m128i __A, const int __mask) { + return (__m128i)__builtin_ia32_pshuflw((__v8hi)__A, __mask); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm_shuffle_epi32(__m128i __A, const int __mask) { + return (__m128i)__builtin_ia32_pshufd((__v4si)__A, __mask); +} +#else +#define _mm_shufflehi_epi16(A, N) \ + ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(A), (int)(N))) +#define _mm_shufflelo_epi16(A, N) \ + ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(A), (int)(N))) +#define _mm_shuffle_epi32(A, N) \ + ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(A), (int)(N))) +#endif + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) { + __builtin_ia32_maskmovdqu((__v16qi)__A, (__v16qi)__B, __C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_avg_epu8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pavgb128((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_avg_epu16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pavgw128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sad_epu8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_psadbw128((__v16qi)__A, (__v16qi)__B); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_si32(int *__A, int __B) { + __builtin_ia32_movnti(__A, __B); +} + +#ifdef __x86_64__ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_si64(long long int *__A, long long int __B) { + __builtin_ia32_movnti64(__A, __B); +} +#endif + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_si128(__m128i *__A, __m128i __B) { + __builtin_ia32_movntdq((__v2di *)__A, (__v2di)__B); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_pd(double *__A, __m128d __B) { + __builtin_ia32_movntpd(__A, (__v2df)__B); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_clflush(void const *__A) { + __builtin_ia32_clflush(__A); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_lfence(void) { + __builtin_ia32_lfence(); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mfence(void) { + __builtin_ia32_mfence(); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi32_si128(int __A) { + return _mm_set_epi32(0, 0, 0, __A); +} + +#ifdef __x86_64__ + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64_si128(long long __A) { + return _mm_set_epi64x(0, __A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64x_si128(long long __A) { + return _mm_set_epi64x(0, __A); +} +#endif + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castpd_ps(__m128d __A) { + return (__m128)__A; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castpd_si128(__m128d __A) { + return (__m128i)__A; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castps_pd(__m128 __A) { + return (__m128d)__A; +} + +extern __inline __m128i + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_castps_si128(__m128 __A) {
+  return (__m128i)__A;
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_castsi128_ps(__m128i __A) {
+  return (__m128)__A;
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_castsi128_pd(__m128i __A) {
+  return (__m128d)__A;
+}
+
+#ifdef __DISABLE_SSE2__
+#undef __DISABLE_SSE2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE2__ */
+
+#endif /* _EMMINTRIN_H_INCLUDED */
diff --git a/third_party/intel/f16cintrin.internal.h b/third_party/intel/f16cintrin.internal.h
new file mode 100644
index 000000000..e32ee703b
--- /dev/null
+++ b/third_party/intel/f16cintrin.internal.h
@@ -0,0 +1,75 @@
+#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
+#error \
+    "Never use <f16cintrin.h> directly; include <x86intrin.h> or <immintrin.h> instead."
+#endif
+
+#ifndef _F16CINTRIN_H_INCLUDED
+#define _F16CINTRIN_H_INCLUDED
+
+#ifndef __F16C__
+#pragma GCC push_options
+#pragma GCC target("f16c")
+#define __DISABLE_F16C__
+#endif /* __F16C__ */
+
+extern __inline float
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _cvtsh_ss(unsigned short __S) {
+  __v8hi __H = __extension__(__v8hi){(short)__S, 0, 0, 0, 0, 0, 0, 0};
+  __v4sf __A = __builtin_ia32_vcvtph2ps(__H);
+  return __builtin_ia32_vec_ext_v4sf(__A, 0);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtph_ps(__m128i __A) {
+  return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__A);
+}
+
+extern __inline __m256
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cvtph_ps(__m128i __A) {
+  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned short
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _cvtss_sh(float __F, const int __I) {
+  __v4sf __A = __extension__(__v4sf){__F, 0, 0, 0};
+  __v8hi __H = __builtin_ia32_vcvtps2ph(__A, __I);
+  return (unsigned short)__builtin_ia32_vec_ext_v8hi(__H, 0);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cvtps_ph(__m128 __A, const int __I) {
+  return (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)__A, __I);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_cvtps_ph(__m256 __A, const int __I) {
+  return (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__A, __I);
+}
+#else
+#define _cvtss_sh(__F, __I) \
+  (__extension__({ \
+    __v4sf __A = __extension__(__v4sf){__F, 0, 0, 0}; \
+    __v8hi __H = __builtin_ia32_vcvtps2ph(__A, __I); \
+    (unsigned short)__builtin_ia32_vec_ext_v8hi(__H, 0); \
+  }))
+
+#define _mm_cvtps_ph(A, I) \
+  ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)A, (int)(I)))
+
+#define _mm256_cvtps_ph(A, I) \
+  ((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)A, (int)(I)))
+#endif /* __OPTIMIZE__ */
+
+#ifdef __DISABLE_F16C__
+#undef __DISABLE_F16C__
+#pragma GCC pop_options
+#endif /* __DISABLE_F16C__ */
+
+#endif /* _F16CINTRIN_H_INCLUDED */
diff --git a/third_party/intel/fma4intrin.internal.h b/third_party/intel/fma4intrin.internal.h
new file mode 100644
index 000000000..535e63555
--- /dev/null
+++ b/third_party/intel/fma4intrin.internal.h
@@ -0,0 +1,248 @@
+#ifndef _X86INTRIN_H_INCLUDED
+#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
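+/* Editorial note (not part of the upstream GCC header): FMA4 is AMD's
+   four-operand fused multiply-add extension from Bulldozer-era chips;
+   e.g. _mm_macc_ps(a, b, c) computes a*b + c per lane with a single
+   rounding step. It is distinct from the three-operand FMA3 extension
+   used by later Intel and AMD CPUs. */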
+#endif + +#ifndef _FMA4INTRIN_H_INCLUDED +#define _FMA4INTRIN_H_INCLUDED + +#include "third_party/intel/ammintrin.internal.h" + +#ifndef __FMA4__ +#pragma GCC push_options +#pragma GCC target("fma4") +#define __DISABLE_FMA4__ +#endif /* __FMA4__ */ + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_macc_ps(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_macc_ss(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_msub_ps(__m128 __A, __m128 __B, __m128 __C) + +{ + return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, + -(__v4sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, + -(__v2df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, + -(__v4sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, + -(__v2df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, + -(__v4sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, + 
-(__v2df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, + -(__v4sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, + -(__v2df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, + -(__v4sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, + -(__v2df)__C); +} + +/* 256b Floating point multiply/add type instructions. */ +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C) { + return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, + (__v4df)__C); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C) + +{ + return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, + -(__v8sf)__C); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, + -(__v4df)__C); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C) { + return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, + (__v4df)__C); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C) { + return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, + -(__v8sf)__C); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, + -(__v4df)__C); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C) { + return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, + (__v4df)__C); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C) { + return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, + -(__v8sf)__C); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, + -(__v4df)__C); +} + +#ifdef __DISABLE_FMA4__ +#undef __DISABLE_FMA4__ +#pragma GCC pop_options +#endif /* __DISABLE_FMA4__ */ + +#endif diff --git a/third_party/intel/fmaintrin.internal.h b/third_party/intel/fmaintrin.internal.h new file mode 100644 index 000000000..bba5306c0 --- /dev/null +++ b/third_party/intel/fmaintrin.internal.h @@ -0,0 +1,241 @@ +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _FMAINTRIN_H_INCLUDED +#define _FMAINTRIN_H_INCLUDED + +#ifndef __FMA__ +#pragma GCC push_options +#pragma GCC target("fma") +#define __DISABLE_FMA__ +#endif /* __FMA__ */ + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, + (__v4df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) { + return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, + (__v4df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fmsub_ps(__m128 
__A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) { + return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmsubsd3((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfmsubss3((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, + (__v4df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) { + return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfnmaddsd3((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfnmaddss3((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, + (__v4df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) { + return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfnmsubsd3((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfnmsubss3((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, + (__v4df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C) { + return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) { + return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, + -(__v2df)__C); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) { + return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, + -(__v4df)__C); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C) { + return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, + -(__v4sf)__C); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C) { + return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, + -(__v8sf)__C); +} + +#ifdef __DISABLE_FMA__ +#undef __DISABLE_FMA__ +#pragma GCC pop_options +#endif /* __DISABLE_FMA__ */ + +#endif diff --git a/third_party/intel/fxsrintrin.internal.h b/third_party/intel/fxsrintrin.internal.h new file mode 100644 index 000000000..28fad84d3 --- /dev/null +++ b/third_party/intel/fxsrintrin.internal.h @@ -0,0 +1,45 @@ +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." 
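The point of the FMA3 wrappers in fmaintrin.internal.h above is that a*b+c is computed with a single rounding step instead of two. A hedged sketch of typical use, assuming this tree's include path; the helper name fma4x is mine:

#include <stdio.h>
#include "third_party/intel/immintrin.internal.h"

/* the caller needs fma codegen enabled (target attribute or -mfma) */
__attribute__((__target__("fma"))) static void fma4x(const float *a,
                                                     const float *b,
                                                     const float *c,
                                                     float *out) {
  __m128 va = _mm_loadu_ps(a);
  __m128 vb = _mm_loadu_ps(b);
  __m128 vc = _mm_loadu_ps(c);
  _mm_storeu_ps(out, _mm_fmadd_ps(va, vb, vc)); /* va*vb+vc, fused */
}

int main(void) {
  float a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, c[4] = {1, 1, 1, 1}, o[4];
  fma4x(a, b, c, o);
  printf("%g %g %g %g\n", o[0], o[1], o[2], o[3]); /* 6 13 22 33 */
  return 0;
}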
+#endif + +#ifndef _FXSRINTRIN_H_INCLUDED +#define _FXSRINTRIN_H_INCLUDED + +#ifndef __FXSR__ +#pragma GCC push_options +#pragma GCC target("fxsr") +#define __DISABLE_FXSR__ +#endif /* __FXSR__ */ + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _fxsave(void *__P) { + __builtin_ia32_fxsave(__P); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _fxrstor(void *__P) { + __builtin_ia32_fxrstor(__P); +} + +#ifdef __x86_64__ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _fxsave64(void *__P) { + __builtin_ia32_fxsave64(__P); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _fxrstor64(void *__P) { + __builtin_ia32_fxrstor64(__P); +} +#endif + +#ifdef __DISABLE_FXSR__ +#undef __DISABLE_FXSR__ +#pragma GCC pop_options +#endif /* __DISABLE_FXSR__ */ + +#endif /* _FXSRINTRIN_H_INCLUDED */ diff --git a/third_party/intel/gfniintrin.internal.h b/third_party/intel/gfniintrin.internal.h new file mode 100644 index 000000000..e19512c4c --- /dev/null +++ b/third_party/intel/gfniintrin.internal.h @@ -0,0 +1,344 @@ +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _GFNIINTRIN_H_INCLUDED +#define _GFNIINTRIN_H_INCLUDED + +#if !defined(__GFNI__) || !defined(__SSE2__) +#pragma GCC push_options +#pragma GCC target("gfni,sse2") +#define __DISABLE_GFNI__ +#endif /* __GFNI__ */ + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_gf2p8mul_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vgf2p8mulb_v16qi((__v16qi)__A, (__v16qi)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_gf2p8affineinv_epi64_epi8(__m128i __A, __m128i __B, const int __C) { + return (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)__A, + (__v16qi)__B, __C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_gf2p8affine_epi64_epi8(__m128i __A, __m128i __B, const int __C) { + return (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)__A, + (__v16qi)__B, __C); +} +#else +#define _mm_gf2p8affineinv_epi64_epi8(A, B, C) \ + ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi( \ + (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(C))) +#define _mm_gf2p8affine_epi64_epi8(A, B, C) \ + ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi( \ + (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(C))) +#endif + +#ifdef __DISABLE_GFNI__ +#undef __DISABLE_GFNI__ +#pragma GCC pop_options +#endif /* __DISABLE_GFNI__ */ + +#if !defined(__GFNI__) || !defined(__AVX__) +#pragma GCC push_options +#pragma GCC target("gfni,avx") +#define __DISABLE_GFNIAVX__ +#endif /* __GFNIAVX__ */ + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_vgf2p8mulb_v32qi((__v32qi)__A, (__v32qi)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_gf2p8affineinv_epi64_epi8(__m256i __A, __m256i __B, const int __C) { + return (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)__A, + (__v32qi)__B, __C); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_gf2p8affine_epi64_epi8(__m256i __A, __m256i 
__B, const int __C) { + return (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)__A, + (__v32qi)__B, __C); +} +#else +#define _mm256_gf2p8affineinv_epi64_epi8(A, B, C) \ + ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi( \ + (__v32qi)(__m256i)(A), (__v32qi)(__m256i)(B), (int)(C))) +#define _mm256_gf2p8affine_epi64_epi8(A, B, C) \ + ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi( \ + (__v32qi)(__m256i)(A), (__v32qi)(__m256i)(B), (int)(C))) +#endif + +#ifdef __DISABLE_GFNIAVX__ +#undef __DISABLE_GFNIAVX__ +#pragma GCC pop_options +#endif /* __GFNIAVX__ */ + +#if !defined(__GFNI__) || !defined(__AVX512VL__) +#pragma GCC push_options +#pragma GCC target("gfni,avx512vl") +#define __DISABLE_GFNIAVX512VL__ +#endif /* __GFNIAVX512VL__ */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_mask_gf2p8mul_epi8(__m128i __A, __mmask16 __B, __m128i __C, __m128i __D) { + return (__m128i)__builtin_ia32_vgf2p8mulb_v16qi_mask( + (__v16qi)__C, (__v16qi)__D, (__v16qi)__A, __B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_gf2p8mul_epi8(__mmask16 __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vgf2p8mulb_v16qi_mask( + (__v16qi)__B, (__v16qi)__C, (__v16qi)_mm_setzero_si128(), __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_gf2p8affineinv_epi64_epi8(__m128i __A, __mmask16 __B, __m128i __C, + __m128i __D, const int __E) { + return (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi_mask( + (__v16qi)__C, (__v16qi)__D, __E, (__v16qi)__A, __B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_gf2p8affineinv_epi64_epi8(__mmask16 __A, __m128i __B, __m128i __C, + const int __D) { + return (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi_mask( + (__v16qi)__B, (__v16qi)__C, __D, (__v16qi)_mm_setzero_si128(), __A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_gf2p8affine_epi64_epi8(__m128i __A, __mmask16 __B, __m128i __C, + __m128i __D, const int __E) { + return (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi_mask( + (__v16qi)__C, (__v16qi)__D, __E, (__v16qi)__A, __B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskz_gf2p8affine_epi64_epi8(__mmask16 __A, __m128i __B, __m128i __C, + const int __D) { + return (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi_mask( + (__v16qi)__B, (__v16qi)__C, __D, (__v16qi)_mm_setzero_si128(), __A); +} +#else +#define _mm_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) \ + ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi_mask( \ + (__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D), (int)(E), \ + (__v16qi)(__m128i)(A), (__mmask16)(B))) +#define _mm_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) \ + ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi_mask( \ + (__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), (int)(D), \ + (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)(A))) +#define _mm_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) \ + ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi_mask( \ + (__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D), (int)(E), \ + (__v16qi)(__m128i)(A), (__mmask16)(B))) +#define _mm_maskz_gf2p8affine_epi64_epi8(A, B, C, D) \ + ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi_mask( \ + (__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), (int)(D), \ + (__v16qi)(__m128i)_mm_setzero_si128(), 
(__mmask16)(A))) +#endif + +#ifdef __DISABLE_GFNIAVX512VL__ +#undef __DISABLE_GFNIAVX512VL__ +#pragma GCC pop_options +#endif /* __GFNIAVX512VL__ */ + +#if !defined(__GFNI__) || !defined(__AVX512VL__) || !defined(__AVX512BW__) +#pragma GCC push_options +#pragma GCC target("gfni,avx512vl,avx512bw") +#define __DISABLE_GFNIAVX512VLBW__ +#endif /* __GFNIAVX512VLBW__ */ + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_gf2p8mul_epi8(__m256i __A, __mmask32 __B, __m256i __C, + __m256i __D) { + return (__m256i)__builtin_ia32_vgf2p8mulb_v32qi_mask( + (__v32qi)__C, (__v32qi)__D, (__v32qi)__A, __B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_gf2p8mul_epi8(__mmask32 __A, __m256i __B, __m256i __C) { + return (__m256i)__builtin_ia32_vgf2p8mulb_v32qi_mask( + (__v32qi)__B, (__v32qi)__C, (__v32qi)_mm256_setzero_si256(), __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_gf2p8affineinv_epi64_epi8(__m256i __A, __mmask32 __B, + __m256i __C, __m256i __D, + const int __E) { + return (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi_mask( + (__v32qi)__C, (__v32qi)__D, __E, (__v32qi)__A, __B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_gf2p8affineinv_epi64_epi8(__mmask32 __A, __m256i __B, + __m256i __C, const int __D) { + return (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi_mask( + (__v32qi)__B, (__v32qi)__C, __D, (__v32qi)_mm256_setzero_si256(), __A); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_mask_gf2p8affine_epi64_epi8(__m256i __A, __mmask32 __B, __m256i __C, + __m256i __D, const int __E) { + return (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi_mask( + (__v32qi)__C, (__v32qi)__D, __E, (__v32qi)__A, __B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_maskz_gf2p8affine_epi64_epi8(__mmask32 __A, __m256i __B, __m256i __C, + const int __D) { + return (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi_mask( + (__v32qi)__B, (__v32qi)__C, __D, (__v32qi)_mm256_setzero_si256(), __A); +} +#else +#define _mm256_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) \ + ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi_mask( \ + (__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E), \ + (__v32qi)(__m256i)(A), (__mmask32)(B))) +#define _mm256_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) \ + ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi_mask( \ + (__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D), \ + (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)(A))) +#define _mm256_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) \ + ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi_mask( \ + (__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E), \ + (__v32qi)(__m256i)(A), (__mmask32)(B))) +#define _mm256_maskz_gf2p8affine_epi64_epi8(A, B, C, D) \ + ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi_mask( \ + (__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D), \ + (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)(A))) +#endif + +#ifdef __DISABLE_GFNIAVX512VLBW__ +#undef __DISABLE_GFNIAVX512VLBW__ +#pragma GCC pop_options +#endif /* __GFNIAVX512VLBW__ */ + +#if !defined(__GFNI__) || !defined(__AVX512F__) || !defined(__AVX512BW__) +#pragma GCC push_options +#pragma GCC target("gfni,avx512f,avx512bw") +#define __DISABLE_GFNIAVX512FBW__ 
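_mm_gf2p8mul_epi8 and the masked variants above multiply bytes in GF(2^8) reduced by the AES polynomial x^8+x^4+x^3+x+1. A small sketch under stated assumptions: this tree's include path, and {53}/{CA} being the multiplicative-inverse pair used in the FIPS-197 walkthrough, so every product lane should be {01}:

#include <stdio.h>
#include "third_party/intel/immintrin.internal.h"

__attribute__((__target__("gfni,sse2"))) static int gf_inverse_check(void) {
  __m128i a = _mm_set1_epi8((char)0x53);
  __m128i b = _mm_set1_epi8((char)0xca);
  __m128i p = _mm_gf2p8mul_epi8(a, b); /* per-byte GF(2^8) mul mod 0x11b */
  return _mm_cvtsi128_si32(p) & 0xff;
}

int main(void) {
  printf("0x%02x\n", gf_inverse_check()); /* expect 0x01 */
  return 0;
}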
+#endif /* __GFNIAVX512FBW__ */ + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_gf2p8mul_epi8(__m512i __A, __mmask64 __B, __m512i __C, + __m512i __D) { + return (__m512i)__builtin_ia32_vgf2p8mulb_v64qi_mask( + (__v64qi)__C, (__v64qi)__D, (__v64qi)__A, __B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_gf2p8mul_epi8(__mmask64 __A, __m512i __B, __m512i __C) { + return (__m512i)__builtin_ia32_vgf2p8mulb_v64qi_mask( + (__v64qi)__B, (__v64qi)__C, (__v64qi)_mm512_setzero_si512(), __A); +} +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_vgf2p8mulb_v64qi((__v64qi)__A, (__v64qi)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_gf2p8affineinv_epi64_epi8(__m512i __A, __mmask64 __B, + __m512i __C, __m512i __D, + const int __E) { + return (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi_mask( + (__v64qi)__C, (__v64qi)__D, __E, (__v64qi)__A, __B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_gf2p8affineinv_epi64_epi8(__mmask64 __A, __m512i __B, + __m512i __C, const int __D) { + return (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi_mask( + (__v64qi)__B, (__v64qi)__C, __D, (__v64qi)_mm512_setzero_si512(), __A); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_gf2p8affineinv_epi64_epi8(__m512i __A, __m512i __B, const int __C) { + return (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)__A, + (__v64qi)__B, __C); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_mask_gf2p8affine_epi64_epi8(__m512i __A, __mmask64 __B, __m512i __C, + __m512i __D, const int __E) { + return (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi_mask( + (__v64qi)__C, (__v64qi)__D, __E, (__v64qi)__A, __B); +} + +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_maskz_gf2p8affine_epi64_epi8(__mmask64 __A, __m512i __B, __m512i __C, + const int __D) { + return (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi_mask( + (__v64qi)__B, (__v64qi)__C, __D, (__v64qi)_mm512_setzero_si512(), __A); +} +extern __inline __m512i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_gf2p8affine_epi64_epi8(__m512i __A, __m512i __B, const int __C) { + return (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)__A, + (__v64qi)__B, __C); +} +#else +#define _mm512_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) \ + ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi_mask( \ + (__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E), \ + (__v64qi)(__m512i)(A), (__mmask64)(B))) +#define _mm512_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) \ + ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi_mask( \ + (__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D), \ + (__v64qi)(__m512i)_mm512_setzero_si512(), (__mmask64)(A))) +#define _mm512_gf2p8affineinv_epi64_epi8(A, B, C) \ + ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi( \ + (__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C))) +#define _mm512_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) \ + ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi_mask( \ + (__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E), \ + 
(__v64qi)(__m512i)(A), (__mmask64)(B))) +#define _mm512_maskz_gf2p8affine_epi64_epi8(A, B, C, D) \ + ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi_mask( \ + (__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D), \ + (__v64qi)(__m512i)_mm512_setzero_si512(), (__mmask64)(A))) +#define _mm512_gf2p8affine_epi64_epi8(A, B, C) \ + ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi( \ + (__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C))) +#endif + +#ifdef __DISABLE_GFNIAVX512FBW__ +#undef __DISABLE_GFNIAVX512FBW__ +#pragma GCC pop_options +#endif /* __GFNIAVX512FBW__ */ + +#endif /* _GFNIINTRIN_H_INCLUDED */ diff --git a/third_party/intel/ia32intrin.internal.h b/third_party/intel/ia32intrin.internal.h new file mode 100644 index 000000000..c20edb72b --- /dev/null +++ b/third_party/intel/ia32intrin.internal.h @@ -0,0 +1,239 @@ +#ifndef _X86INTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bsfd(int __X) { + return __builtin_ctz(__X); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bsrd(int __X) { + return __builtin_ia32_bsrsi(__X); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bswapd(int __X) { + return __builtin_bswap32(__X); +} + +#ifndef __iamcu__ + +#ifndef __SSE4_2__ +#pragma GCC push_options +#pragma GCC target("sse4.2") +#define __DISABLE_SSE4_2__ +#endif /* __SSE4_2__ */ + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __crc32b(unsigned int __C, unsigned char __V) { + return __builtin_ia32_crc32qi(__C, __V); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __crc32w(unsigned int __C, unsigned short __V) { + return __builtin_ia32_crc32hi(__C, __V); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __crc32d(unsigned int __C, unsigned int __V) { + return __builtin_ia32_crc32si(__C, __V); +} + +#ifdef __DISABLE_SSE4_2__ +#undef __DISABLE_SSE4_2__ +#pragma GCC pop_options +#endif /* __DISABLE_SSE4_2__ */ + +#endif /* __iamcu__ */ + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __popcntd(unsigned int __X) { + return __builtin_popcount(__X); +} + +#ifndef __iamcu__ + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __rdpmc(int __S) { + return __builtin_ia32_rdpmc(__S); +} + +#endif /* __iamcu__ */ + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __rdtsc(void) { + return __builtin_ia32_rdtsc(); +} + +#ifndef __iamcu__ + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __rdtscp(unsigned int *__A) { + return __builtin_ia32_rdtscp(__A); +} + +#endif /* __iamcu__ */ + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __rolb(unsigned char __X, int __C) { + return __builtin_ia32_rolqi(__X, __C); +} + +extern __inline unsigned short + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __rolw(unsigned short __X, int __C) { + return __builtin_ia32_rolhi(__X, __C); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __rold(unsigned int __X, int __C) { + __C &= 31; + return (__X << __C) | 
(__X >> (-__C & 31)); +} + +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __rorb(unsigned char __X, int __C) { + return __builtin_ia32_rorqi(__X, __C); +} + +extern __inline unsigned short + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __rorw(unsigned short __X, int __C) { + return __builtin_ia32_rorhi(__X, __C); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __rord(unsigned int __X, int __C) { + __C &= 31; + return (__X >> __C) | (__X << (-__C & 31)); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __pause(void) { + __builtin_ia32_pause(); +} + +#ifdef __x86_64__ + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bsfq(long long __X) { + return __builtin_ctzll(__X); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bsrq(long long __X) { + return __builtin_ia32_bsrdi(__X); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bswapq(long long __X) { + return __builtin_bswap64(__X); +} + +#ifndef __SSE4_2__ +#pragma GCC push_options +#pragma GCC target("sse4.2") +#define __DISABLE_SSE4_2__ +#endif /* __SSE4_2__ */ + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __crc32q(unsigned long long __C, unsigned long long __V) { + return __builtin_ia32_crc32di(__C, __V); +} + +#ifdef __DISABLE_SSE4_2__ +#undef __DISABLE_SSE4_2__ +#pragma GCC pop_options +#endif /* __DISABLE_SSE4_2__ */ + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __popcntq(unsigned long long __X) { + return __builtin_popcountll(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __rolq(unsigned long long __X, int __C) { + __C &= 63; + return (__X << __C) | (__X >> (-__C & 63)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __rorq(unsigned long long __X, int __C) { + __C &= 63; + return (__X >> __C) | (__X << (-__C & 63)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __readeflags(void) { + return __builtin_ia32_readeflags_u64(); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __writeeflags(unsigned long long __X) { + __builtin_ia32_writeeflags_u64(__X); +} + +#define _bswap64(a) __bswapq(a) +#define _popcnt64(a) __popcntq(a) +#else + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __readeflags(void) { + return __builtin_ia32_readeflags_u32(); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __writeeflags(unsigned int __X) { + __builtin_ia32_writeeflags_u32(__X); +} + +#endif + +#ifdef __LP64__ +#define _lrotl(a, b) __rolq((a), (b)) +#define _lrotr(a, b) __rorq((a), (b)) +#else +#define _lrotl(a, b) __rold((a), (b)) +#define _lrotr(a, b) __rord((a), (b)) +#endif + +#define _bit_scan_forward(a) __bsfd(a) +#define _bit_scan_reverse(a) __bsrd(a) +#define _bswap(a) __bswapd(a) +#define _popcnt32(a) __popcntd(a) +#ifndef __iamcu__ +#define _rdpmc(a) __rdpmc(a) +#define _rdtscp(a) __rdtscp(a) +#endif /* __iamcu__ */ +#define _rdtsc() __rdtsc() +#define 
_rotwl(a, b) __rolw((a), (b)) +#define _rotwr(a, b) __rorw((a), (b)) +#define _rotl(a, b) __rold((a), (b)) +#define _rotr(a, b) __rord((a), (b)) diff --git a/third_party/intel/immintrin.internal.h b/third_party/intel/immintrin.internal.h new file mode 100644 index 000000000..e8e5ec77d --- /dev/null +++ b/third_party/intel/immintrin.internal.h @@ -0,0 +1,209 @@ +#ifndef _IMMINTRIN_H_INCLUDED +#define _IMMINTRIN_H_INCLUDED + +/* clang-format off */ +#include "third_party/intel/mmintrin.internal.h" +#include "third_party/intel/xmmintrin.internal.h" +#include "third_party/intel/emmintrin.internal.h" +#include "third_party/intel/pmmintrin.internal.h" +#include "third_party/intel/tmmintrin.internal.h" +#include "third_party/intel/smmintrin.internal.h" +#include "third_party/intel/wmmintrin.internal.h" +#include "third_party/intel/fxsrintrin.internal.h" +#include "third_party/intel/xsaveintrin.internal.h" +#include "third_party/intel/xsaveoptintrin.internal.h" +#include "third_party/intel/xsavesintrin.internal.h" +#include "third_party/intel/xsavecintrin.internal.h" +#include "third_party/intel/avxintrin.internal.h" +#include "third_party/intel/avx2intrin.internal.h" +#include "third_party/intel/avx512fintrin.internal.h" +#include "third_party/intel/avx512erintrin.internal.h" +#include "third_party/intel/avx512pfintrin.internal.h" +#include "third_party/intel/avx512cdintrin.internal.h" +#include "third_party/intel/avx512vlintrin.internal.h" +#include "third_party/intel/avx512bwintrin.internal.h" +#include "third_party/intel/avx512dqintrin.internal.h" +#include "third_party/intel/avx512vlbwintrin.internal.h" +#include "third_party/intel/avx512vldqintrin.internal.h" +#include "third_party/intel/avx512ifmaintrin.internal.h" +#include "third_party/intel/avx512ifmavlintrin.internal.h" +#include "third_party/intel/avx512vbmiintrin.internal.h" +#include "third_party/intel/avx512vbmivlintrin.internal.h" +#include "third_party/intel/avx5124fmapsintrin.internal.h" +#include "third_party/intel/avx5124vnniwintrin.internal.h" +#include "third_party/intel/avx512vpopcntdqintrin.internal.h" +#include "third_party/intel/avx512vbmi2intrin.internal.h" +#include "third_party/intel/avx512vbmi2vlintrin.internal.h" +#include "third_party/intel/avx512vnniintrin.internal.h" +#include "third_party/intel/avx512vnnivlintrin.internal.h" +#include "third_party/intel/avx512vpopcntdqvlintrin.internal.h" +#include "third_party/intel/avx512bitalgintrin.internal.h" +#include "third_party/intel/shaintrin.internal.h" +#include "third_party/intel/lzcntintrin.internal.h" +#include "third_party/intel/bmiintrin.internal.h" +#include "third_party/intel/bmi2intrin.internal.h" +#include "third_party/intel/fmaintrin.internal.h" +#include "third_party/intel/f16cintrin.internal.h" +#include "third_party/intel/rtmintrin.internal.h" +#include "third_party/intel/xtestintrin.internal.h" +#include "third_party/intel/cetintrin.internal.h" +#include "third_party/intel/gfniintrin.internal.h" +#include "third_party/intel/vaesintrin.internal.h" +#include "third_party/intel/vpclmulqdqintrin.internal.h" +#include "third_party/intel/movdirintrin.internal.h" +#include "third_party/intel/sgxintrin.internal.h" +#include "third_party/intel/pconfigintrin.internal.h" +#include "third_party/intel/waitpkgintrin.internal.h" +#include "third_party/intel/cldemoteintrin.internal.h" +#include "third_party/intel/rdseedintrin.internal.h" +#include "third_party/intel/prfchwintrin.internal.h" +#include "third_party/intel/adxintrin.internal.h" +#include 
"third_party/intel/clwbintrin.internal.h" +#include "third_party/intel/clflushoptintrin.internal.h" +#include "third_party/intel/wbnoinvdintrin.internal.h" +#include "third_party/intel/pkuintrin.internal.h" +/* clang-format on */ + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _wbinvd(void) { + __builtin_ia32_wbinvd(); +} + +#ifndef __RDRND__ +#pragma GCC push_options +#pragma GCC target("rdrnd") +#define __DISABLE_RDRND__ +#endif /* __RDRND__ */ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _rdrand16_step(unsigned short *__P) { + return __builtin_ia32_rdrand16_step(__P); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _rdrand32_step(unsigned int *__P) { + return __builtin_ia32_rdrand32_step(__P); +} +#ifdef __DISABLE_RDRND__ +#undef __DISABLE_RDRND__ +#pragma GCC pop_options +#endif /* __DISABLE_RDRND__ */ + +#ifndef __RDPID__ +#pragma GCC push_options +#pragma GCC target("rdpid") +#define __DISABLE_RDPID__ +#endif /* __RDPID__ */ +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _rdpid_u32(void) { + return __builtin_ia32_rdpid(); +} +#ifdef __DISABLE_RDPID__ +#undef __DISABLE_RDPID__ +#pragma GCC pop_options +#endif /* __DISABLE_RDPID__ */ + +#ifdef __x86_64__ + +#ifndef __FSGSBASE__ +#pragma GCC push_options +#pragma GCC target("fsgsbase") +#define __DISABLE_FSGSBASE__ +#endif /* __FSGSBASE__ */ +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _readfsbase_u32(void) { + return __builtin_ia32_rdfsbase32(); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _readfsbase_u64(void) { + return __builtin_ia32_rdfsbase64(); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _readgsbase_u32(void) { + return __builtin_ia32_rdgsbase32(); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _readgsbase_u64(void) { + return __builtin_ia32_rdgsbase64(); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _writefsbase_u32(unsigned int __B) { + __builtin_ia32_wrfsbase32(__B); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _writefsbase_u64(unsigned long long __B) { + __builtin_ia32_wrfsbase64(__B); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _writegsbase_u32(unsigned int __B) { + __builtin_ia32_wrgsbase32(__B); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _writegsbase_u64(unsigned long long __B) { + __builtin_ia32_wrgsbase64(__B); +} +#ifdef __DISABLE_FSGSBASE__ +#undef __DISABLE_FSGSBASE__ +#pragma GCC pop_options +#endif /* __DISABLE_FSGSBASE__ */ + +#ifndef __RDRND__ +#pragma GCC push_options +#pragma GCC target("rdrnd") +#define __DISABLE_RDRND__ +#endif /* __RDRND__ */ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _rdrand64_step(unsigned long long *__P) { + return __builtin_ia32_rdrand64_step(__P); +} +#ifdef __DISABLE_RDRND__ +#undef __DISABLE_RDRND__ +#pragma GCC pop_options +#endif /* __DISABLE_RDRND__ */ + +#endif /* __x86_64__ */ + +#ifndef __PTWRITE__ +#pragma GCC push_options +#pragma GCC target("ptwrite") +#define __DISABLE_PTWRITE__ 
+#endif + +#ifdef __x86_64__ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _ptwrite64(unsigned long long __B) { + __builtin_ia32_ptwrite64(__B); +} +#endif /* __x86_64__ */ + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _ptwrite32(unsigned __B) { + __builtin_ia32_ptwrite32(__B); +} +#ifdef __DISABLE_PTWRITE__ +#undef __DISABLE_PTWRITE__ +#pragma GCC pop_options +#endif /* __DISABLE_PTWRITE__ */ + +#endif /* _IMMINTRIN_H_INCLUDED */ diff --git a/third_party/intel/intel.mk b/third_party/intel/intel.mk new file mode 100644 index 000000000..0984ca613 --- /dev/null +++ b/third_party/intel/intel.mk @@ -0,0 +1,6 @@ +#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐ +#───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘ + +PKGS += THIRD_PARTY_INTEL +THIRD_PARTY_INTEL_HDRS = $(filter %.h,$(THIRD_PARTY_INTEL_FILES)) +THIRD_PARTY_INTEL_FILES := $(wildcard third_party/intel/*) diff --git a/third_party/intel/lwpintrin.internal.h b/third_party/intel/lwpintrin.internal.h new file mode 100644 index 000000000..58324b3a3 --- /dev/null +++ b/third_party/intel/lwpintrin.internal.h @@ -0,0 +1,82 @@ +#ifndef _X86INTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _LWPINTRIN_H_INCLUDED +#define _LWPINTRIN_H_INCLUDED + +#ifndef __LWP__ +#pragma GCC push_options +#pragma GCC target("lwp") +#define __DISABLE_LWP__ +#endif /* __LWP__ */ + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __llwpcb(void *__pcbAddress) { + __builtin_ia32_llwpcb(__pcbAddress); +} + +extern __inline void *__attribute__((__gnu_inline__, __always_inline__, + __artificial__)) __slwpcb(void) { + return __builtin_ia32_slwpcb(); +} + +#ifdef __OPTIMIZE__ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +__lwpval32(unsigned int __data2, unsigned int __data1, unsigned int __flags) { + __builtin_ia32_lwpval32(__data2, __data1, __flags); +} + +#ifdef __x86_64__ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __lwpval64(unsigned long long __data2, unsigned int __data1, + unsigned int __flags) { + __builtin_ia32_lwpval64(__data2, __data1, __flags); +} +#endif +#else +#define __lwpval32(D2, D1, F) \ + (__builtin_ia32_lwpval32((unsigned int)(D2), (unsigned int)(D1), \ + (unsigned int)(F))) +#ifdef __x86_64__ +#define __lwpval64(D2, D1, F) \ + (__builtin_ia32_lwpval64((unsigned long long)(D2), (unsigned int)(D1), \ + (unsigned int)(F))) +#endif +#endif + +#ifdef __OPTIMIZE__ +extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +__lwpins32(unsigned int __data2, unsigned int __data1, unsigned int __flags) { + return __builtin_ia32_lwpins32(__data2, __data1, __flags); +} + +#ifdef __x86_64__ +extern __inline unsigned char + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __lwpins64(unsigned long long __data2, unsigned int __data1, + unsigned int __flags) { + return __builtin_ia32_lwpins64(__data2, __data1, __flags); +} +#endif +#else +#define __lwpins32(D2, D1, F) \ + (__builtin_ia32_lwpins32((unsigned int)(D2), (unsigned int)(D1), \ + (unsigned int)(F))) +#ifdef __x86_64__ +#define __lwpins64(D2, D1, F) \ + (__builtin_ia32_lwpins64((unsigned long long)(D2), (unsigned int)(D1), \ + (unsigned int)(F))) +#endif +#endif + +#ifdef __DISABLE_LWP__ +#undef __DISABLE_LWP__ +#pragma GCC pop_options 
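lwpintrin.internal.h above wraps AMD's Lightweight Profiling, which shipped only on Bulldozer-family parts, so any use belongs behind a runtime CPUID check. The fragment below is only a sketch of the caller-side pattern these pragma-guarded headers expect: the always_inline wrapper inherits the lwp target from the header, but the function calling it must opt in too. The umbrella header name is an assumption about this tree:

#include "third_party/intel/x86intrin.internal.h" /* assumed umbrella header */

/* only call on hardware whose CPUID reports LWP, else this faults */
__attribute__((__target__("lwp"))) static void *current_lwp_pcb(void) {
  return __slwpcb(); /* address of the active LWP control block */
}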
+#endif /* __DISABLE_LWP__ */ + +#endif /* _LWPINTRIN_H_INCLUDED */ diff --git a/third_party/intel/lzcntintrin.internal.h b/third_party/intel/lzcntintrin.internal.h new file mode 100644 index 000000000..30b01ec8b --- /dev/null +++ b/third_party/intel/lzcntintrin.internal.h @@ -0,0 +1,51 @@ +#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _LZCNTINTRIN_H_INCLUDED +#define _LZCNTINTRIN_H_INCLUDED + +#ifndef __LZCNT__ +#pragma GCC push_options +#pragma GCC target("lzcnt") +#define __DISABLE_LZCNT__ +#endif /* __LZCNT__ */ + +extern __inline unsigned short + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __lzcnt16(unsigned short __X) { + return __builtin_ia32_lzcnt_u16(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __lzcnt32(unsigned int __X) { + return __builtin_ia32_lzcnt_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _lzcnt_u32(unsigned int __X) { + return __builtin_ia32_lzcnt_u32(__X); +} + +#ifdef __x86_64__ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __lzcnt64(unsigned long long __X) { + return __builtin_ia32_lzcnt_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _lzcnt_u64(unsigned long long __X) { + return __builtin_ia32_lzcnt_u64(__X); +} +#endif + +#ifdef __DISABLE_LZCNT__ +#undef __DISABLE_LZCNT__ +#pragma GCC pop_options +#endif /* __DISABLE_LZCNT__ */ + +#endif /* _LZCNTINTRIN_H_INCLUDED */ diff --git a/third_party/intel/mm3dnow.internal.h b/third_party/intel/mm3dnow.internal.h new file mode 100644 index 000000000..021d97cf3 --- /dev/null +++ b/third_party/intel/mm3dnow.internal.h @@ -0,0 +1,209 @@ +#ifndef _MM3DNOW_H_INCLUDED +#define _MM3DNOW_H_INCLUDED +#include "third_party/intel/mmintrin.internal.h" +#include "third_party/intel/prfchwintrin.internal.h" + +#if defined __x86_64__ && !defined __SSE__ || !defined __3dNOW__ +#pragma GCC push_options +#ifdef __x86_64__ +#pragma GCC target("sse,3dnow") +#else +#pragma GCC target("3dnow") +#endif +#define __DISABLE_3dNOW__ +#endif /* __3dNOW__ */ + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_femms(void) { + __builtin_ia32_femms(); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pavgusb(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pavgusb((__v8qi)__A, (__v8qi)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pf2id(__m64 __A) { + return (__m64)__builtin_ia32_pf2id((__v2sf)__A); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfacc(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pfacc((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfadd(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pfadd((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfcmpeq(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfcmpge(__m64 __A, __m64 __B) { + 
return (__m64)__builtin_ia32_pfcmpge((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfcmpgt(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfmax(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pfmax((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfmin(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pfmin((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfmul(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pfmul((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfrcp(__m64 __A) { + return (__m64)__builtin_ia32_pfrcp((__v2sf)__A); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfrcpit1(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfrcpit2(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfrsqrt(__m64 __A) { + return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__A); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfrsqit1(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfsub(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pfsub((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfsubr(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pfsubr((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pi2fd(__m64 __A) { + return (__m64)__builtin_ia32_pi2fd((__v2si)__A); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmulhrw(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pmulhrw((__v4hi)__A, (__v4hi)__B); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_prefetch(void *__P) { + __builtin_prefetch(__P, 0, 3 /* _MM_HINT_T0 */); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_from_float(float __A) { + return __extension__(__m64)(__v2sf){__A, 0.0f}; +} + +extern __inline float + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_to_float(__m64 __A) { + union { + __v2sf v; + float a[2]; + } __tmp; + __tmp.v = (__v2sf)__A; + return __tmp.a[0]; +} + +#ifdef __DISABLE_3dNOW__ +#undef __DISABLE_3dNOW__ +#pragma GCC pop_options +#endif /* __DISABLE_3dNOW__ */ + +#if defined __x86_64__ && !defined __SSE__ || !defined __3dNOW_A__ +#pragma GCC push_options +#ifdef __x86_64__ +#pragma GCC target("sse,3dnowa") +#else +#pragma GCC target("3dnowa") +#endif +#define __DISABLE_3dNOW_A__ +#endif /* __3dNOW_A__ */ + +extern __inline __m64 + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pf2iw(__m64 __A) { + return (__m64)__builtin_ia32_pf2iw((__v2sf)__A); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfnacc(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pfnacc((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pfpnacc(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pfpnacc((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pi2fw(__m64 __A) { + return (__m64)__builtin_ia32_pi2fw((__v2si)__A); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pswapd(__m64 __A) { + return (__m64)__builtin_ia32_pswapdsf((__v2sf)__A); +} + +#ifdef __DISABLE_3dNOW_A__ +#undef __DISABLE_3dNOW_A__ +#pragma GCC pop_options +#endif /* __DISABLE_3dNOW_A__ */ + +#endif /* _MM3DNOW_H_INCLUDED */ diff --git a/third_party/intel/mm_malloc.internal.h b/third_party/intel/mm_malloc.internal.h new file mode 100644 index 000000000..b27996135 --- /dev/null +++ b/third_party/intel/mm_malloc.internal.h @@ -0,0 +1,27 @@ +#ifndef _MM_MALLOC_H_INCLUDED +#define _MM_MALLOC_H_INCLUDED +#include "libc/mem/mem.h" + +#ifndef __cplusplus +extern int _mm_posix_memalign(void **, size_t, size_t) +#else +extern "C" int _mm_posix_memalign(void **, size_t, size_t) throw() +#endif + __asm__("posix_memalign"); + +static __inline void *_mm_malloc(size_t __size, size_t __alignment) { + void *__ptr; + if (__alignment == 1) return malloc(__size); + if (__alignment == 2 || (sizeof(void *) == 8 && __alignment == 4)) + __alignment = sizeof(void *); + if (_mm_posix_memalign(&__ptr, __alignment, __size) == 0) + return __ptr; + else + return NULL; +} + +static __inline void _mm_free(void *__ptr) { + free(__ptr); +} + +#endif /* _MM_MALLOC_H_INCLUDED */ diff --git a/third_party/intel/mmintrin.internal.h b/third_party/intel/mmintrin.internal.h new file mode 100644 index 000000000..0cbbf5850 --- /dev/null +++ b/third_party/intel/mmintrin.internal.h @@ -0,0 +1,832 @@ +#ifndef _MMINTRIN_H_INCLUDED +#define _MMINTRIN_H_INCLUDED + +#if defined __x86_64__ && !defined __SSE__ || !defined __MMX__ +#pragma GCC push_options +#ifdef __x86_64__ +#pragma GCC target("sse,mmx") +#else +#pragma GCC target("mmx") +#endif +#define __DISABLE_MMX__ +#endif /* __MMX__ */ + +typedef int __m64 __attribute__((__vector_size__(8), __may_alias__)); + +typedef int __m64_u + __attribute__((__vector_size__(8), __may_alias__, __aligned__(1))); + +typedef int __v2si __attribute__((__vector_size__(8))); +typedef short __v4hi __attribute__((__vector_size__(8))); +typedef char __v8qi __attribute__((__vector_size__(8))); +typedef long long __v1di __attribute__((__vector_size__(8))); +typedef float __v2sf __attribute__((__vector_size__(8))); + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_empty(void) { + __builtin_ia32_emms(); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_empty(void) { + _mm_empty(); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi32_si64(int __i) { + return (__m64)__builtin_ia32_vec_init_v2si(__i, 0); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_from_int(int __i) { + return _mm_cvtsi32_si64(__i); 
+} + +#ifdef __x86_64__ + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_from_int64(long long __i) { + return (__m64)__i; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64_m64(long long __i) { + return (__m64)__i; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64x_si64(long long __i) { + return (__m64)__i; +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_pi64x(long long __i) { + return (__m64)__i; +} +#endif + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64_si32(__m64 __i) { + return __builtin_ia32_vec_ext_v2si((__v2si)__i, 0); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_to_int(__m64 __i) { + return _mm_cvtsi64_si32(__i); +} + +#ifdef __x86_64__ + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_to_int64(__m64 __i) { + return (long long)__i; +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtm64_si64(__m64 __i) { + return (long long)__i; +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64_si64x(__m64 __i) { + return (long long)__i; +} +#endif + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packs_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_packsswb(__m64 __m1, __m64 __m2) { + return _mm_packs_pi16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packs_pi32(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_packssdw(__m64 __m1, __m64 __m2) { + return _mm_packs_pi32(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packs_pu16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_packuswb(__m64 __m1, __m64 __m2) { + return _mm_packs_pu16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpckhbw(__m64 __m1, __m64 __m2) { + return _mm_unpackhi_pi8(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpckhwd(__m64 __m1, __m64 __m2) { + return _mm_unpackhi_pi16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { + return 
(__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpckhdq(__m64 __m1, __m64 __m2) { + return _mm_unpackhi_pi32(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpcklbw(__m64 __m1, __m64 __m2) { + return _mm_unpacklo_pi8(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpcklwd(__m64 __m1, __m64 __m2) { + return _mm_unpacklo_pi16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_punpckldq(__m64 __m1, __m64 __m2) { + return _mm_unpacklo_pi32(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_pi8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddb(__m64 __m1, __m64 __m2) { + return _mm_add_pi8(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddw(__m64 __m1, __m64 __m2) { + return _mm_add_pi16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_pi32(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddd(__m64 __m1, __m64 __m2) { + return _mm_add_pi32(__m1, __m2); +} + +#ifndef __SSE2__ +#pragma GCC push_options +#pragma GCC target("sse2,mmx") +#define __DISABLE_SSE2__ +#endif /* __SSE2__ */ + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_si64(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_paddq((__v1di)__m1, (__v1di)__m2); +} +#ifdef __DISABLE_SSE2__ +#undef __DISABLE_SSE2__ +#pragma GCC pop_options +#endif /* __DISABLE_SSE2__ */ + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_pi8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddsb(__m64 __m1, __m64 __m2) { + return _mm_adds_pi8(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2); +} + 
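[An editorial aside, not part of the commit: the paired wrappers above all follow one pattern, where each `_mm_*` intrinsic maps straight onto a `__builtin_ia32_*` builtin and each `_m_*` alias forwards to it. The distinction that actually matters to callers is wrapping versus saturating arithmetic. A minimal sketch, assuming the header's cosmopolitan include path and the `_mm_set1_pi8`/`_mm_cvtsi64_si32`/`_mm_empty` helpers declared elsewhere in this same file:

    /* illustrative only; not part of this diff */
    #include "third_party/intel/mmintrin.internal.h"

    int mmx_saturation_demo(void) {
      __m64 a = _mm_set1_pi8(100);        /* eight signed bytes, each 100  */
      __m64 wrapped = _mm_add_pi8(a, a);  /* 100+100 wraps to -56 per lane */
      __m64 clamped = _mm_adds_pi8(a, a); /* signed saturation clamps: 127 */
      int lo = _mm_cvtsi64_si32(clamped); /* low 32 bits: 0x7f7f7f7f      */
      (void)wrapped;
      _mm_empty(); /* MMX registers alias the x87 stack; reset before FP */
      return lo;
    }

The closing `_mm_empty()` is not optional in real code: MMX state aliases the x87 register stack, so skipping it corrupts any floating-point code that runs afterward.]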
+extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddsw(__m64 __m1, __m64 __m2) { + return _mm_adds_pi16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_pu8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddusb(__m64 __m1, __m64 __m2) { + return _mm_adds_pu8(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_adds_pu16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_paddusw(__m64 __m1, __m64 __m2) { + return _mm_adds_pu16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_pi8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubb(__m64 __m1, __m64 __m2) { + return _mm_sub_pi8(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubw(__m64 __m1, __m64 __m2) { + return _mm_sub_pi16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_pi32(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubd(__m64 __m1, __m64 __m2) { + return _mm_sub_pi32(__m1, __m2); +} + +#ifndef __SSE2__ +#pragma GCC push_options +#pragma GCC target("sse2,mmx") +#define __DISABLE_SSE2__ +#endif /* __SSE2__ */ + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_si64(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_psubq((__v1di)__m1, (__v1di)__m2); +} +#ifdef __DISABLE_SSE2__ +#undef __DISABLE_SSE2__ +#pragma GCC pop_options +#endif /* __DISABLE_SSE2__ */ + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_pi8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubsb(__m64 __m1, __m64 __m2) { + return _mm_subs_pi8(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubsw(__m64 __m1, __m64 __m2) { + return _mm_subs_pi16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_pu8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubusb(__m64 
__m1, __m64 __m2) { + return _mm_subs_pu8(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_subs_pu16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psubusw(__m64 __m1, __m64 __m2) { + return _mm_subs_pu16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_madd_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmaddwd(__m64 __m1, __m64 __m2) { + return _mm_madd_pi16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mulhi_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmulhw(__m64 __m1, __m64 __m2) { + return _mm_mulhi_pi16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mullo_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmullw(__m64 __m1, __m64 __m2) { + return _mm_mullo_pi16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sll_pi16(__m64 __m, __m64 __count) { + return (__m64)__builtin_ia32_psllw((__v4hi)__m, (__v4hi)__count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psllw(__m64 __m, __m64 __count) { + return _mm_sll_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_slli_pi16(__m64 __m, int __count) { + return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psllwi(__m64 __m, int __count) { + return _mm_slli_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sll_pi32(__m64 __m, __m64 __count) { + return (__m64)__builtin_ia32_pslld((__v2si)__m, (__v2si)__count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pslld(__m64 __m, __m64 __count) { + return _mm_sll_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_slli_pi32(__m64 __m, int __count) { + return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pslldi(__m64 __m, int __count) { + return _mm_slli_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sll_si64(__m64 __m, __m64 __count) { + return (__m64)__builtin_ia32_psllq((__v1di)__m, (__v1di)__count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psllq(__m64 __m, __m64 __count) { + return _mm_sll_si64(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm_slli_si64(__m64 __m, int __count) { + return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psllqi(__m64 __m, int __count) { + return _mm_slli_si64(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sra_pi16(__m64 __m, __m64 __count) { + return (__m64)__builtin_ia32_psraw((__v4hi)__m, (__v4hi)__count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psraw(__m64 __m, __m64 __count) { + return _mm_sra_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srai_pi16(__m64 __m, int __count) { + return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrawi(__m64 __m, int __count) { + return _mm_srai_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sra_pi32(__m64 __m, __m64 __count) { + return (__m64)__builtin_ia32_psrad((__v2si)__m, (__v2si)__count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrad(__m64 __m, __m64 __count) { + return _mm_sra_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srai_pi32(__m64 __m, int __count) { + return (__m64)__builtin_ia32_psradi((__v2si)__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psradi(__m64 __m, int __count) { + return _mm_srai_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srl_pi16(__m64 __m, __m64 __count) { + return (__m64)__builtin_ia32_psrlw((__v4hi)__m, (__v4hi)__count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrlw(__m64 __m, __m64 __count) { + return _mm_srl_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srli_pi16(__m64 __m, int __count) { + return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrlwi(__m64 __m, int __count) { + return _mm_srli_pi16(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srl_pi32(__m64 __m, __m64 __count) { + return (__m64)__builtin_ia32_psrld((__v2si)__m, (__v2si)__count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrld(__m64 __m, __m64 __count) { + return _mm_srl_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srli_pi32(__m64 __m, int __count) { + return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrldi(__m64 __m, int __count) { + return _mm_srli_pi32(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srl_si64(__m64 __m, __m64 __count) { + return (__m64)__builtin_ia32_psrlq((__v1di)__m, (__v1di)__count); +} + +extern __inline __m64 + 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrlq(__m64 __m, __m64 __count) { + return _mm_srl_si64(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_srli_si64(__m64 __m, int __count) { + return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psrlqi(__m64 __m, int __count) { + return _mm_srli_si64(__m, __count); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_and_si64(__m64 __m1, __m64 __m2) { + return __builtin_ia32_pand(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pand(__m64 __m1, __m64 __m2) { + return _mm_and_si64(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_andnot_si64(__m64 __m1, __m64 __m2) { + return __builtin_ia32_pandn(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pandn(__m64 __m1, __m64 __m2) { + return _mm_andnot_si64(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_or_si64(__m64 __m1, __m64 __m2) { + return __builtin_ia32_por(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_por(__m64 __m1, __m64 __m2) { + return _mm_or_si64(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_xor_si64(__m64 __m1, __m64 __m2) { + return __builtin_ia32_pxor(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pxor(__m64 __m1, __m64 __m2) { + return _mm_xor_si64(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpeqb(__m64 __m1, __m64 __m2) { + return _mm_cmpeq_pi8(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpgtb(__m64 __m1, __m64 __m2) { + return _mm_cmpgt_pi8(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpeqw(__m64 __m1, __m64 __m2) { + return _mm_cmpeq_pi16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pcmpgtw(__m64 __m1, __m64 __m2) { + return _mm_cmpgt_pi16(__m1, __m2); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_pi32(__m64 
__m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
+  return _mm_cmpeq_pi32(__m1, __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
+  return _mm_cmpgt_pi32(__m1, __m2);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_setzero_si64(void) {
+  return (__m64)0LL;
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set_pi32(int __i1, int __i0) {
+  return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
+  return (__m64)__builtin_ia32_vec_init_v4hi(__w0, __w1, __w2, __w3);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
+                char __b2, char __b1, char __b0) {
+  return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3, __b4, __b5,
+                                             __b6, __b7);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_setr_pi32(int __i0, int __i1) {
+  return _mm_set_pi32(__i1, __i0);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
+  return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
+                 char __b5, char __b6, char __b7) {
+  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set1_pi32(int __i) {
+  return _mm_set_pi32(__i, __i);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set1_pi16(short __w) {
+  return _mm_set_pi16(__w, __w, __w, __w);
+}
+
+extern __inline __m64
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_set1_pi8(char __b) {
+  return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
+}
+#ifdef __DISABLE_MMX__
+#undef __DISABLE_MMX__
+#pragma GCC pop_options
+#endif /* __DISABLE_MMX__ */
+
+#endif /* _MMINTRIN_H_INCLUDED */
diff --git a/third_party/intel/movdirintrin.internal.h b/third_party/intel/movdirintrin.internal.h
new file mode 100644
index 000000000..72baaca64
--- /dev/null
+++ b/third_party/intel/movdirintrin.internal.h
@@ -0,0 +1,48 @@
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <movdirintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _MOVDIRINTRIN_H_INCLUDED
+#define _MOVDIRINTRIN_H_INCLUDED
+
+#ifndef __MOVDIRI__
+#pragma GCC push_options
+#pragma GCC target("movdiri")
+#define __DISABLE_MOVDIRI__
+#endif /* __MOVDIRI__ */
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _directstoreu_u32(void *__P, unsigned int __A) {
+  __builtin_ia32_directstoreu_u32((unsigned int *)__P, __A);
+}
+#ifdef __x86_64__
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _directstoreu_u64(void *__P, unsigned long long __A) {
+  __builtin_ia32_directstoreu_u64((unsigned long long *)__P, __A);
+}
+#endif
+
+#ifdef __DISABLE_MOVDIRI__
+#undef __DISABLE_MOVDIRI__
+#pragma GCC pop_options
+#endif /* __DISABLE_MOVDIRI__ */
+
+#ifndef __MOVDIR64B__
+#pragma GCC push_options
+#pragma GCC target("movdir64b")
+#define __DISABLE_MOVDIR64B__
+#endif /* __MOVDIR64B__ */
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _movdir64b(void *__P, const void *__Q) {
+  __builtin_ia32_movdir64b(__P, __Q);
+}
+
+#ifdef __DISABLE_MOVDIR64B__
+#undef __DISABLE_MOVDIR64B__
+#pragma GCC pop_options
+#endif /* __DISABLE_MOVDIR64B__ */
+#endif /* _MOVDIRINTRIN_H_INCLUDED. */
diff --git a/third_party/intel/mwaitxintrin.internal.h b/third_party/intel/mwaitxintrin.internal.h
new file mode 100644
index 000000000..18813288d
--- /dev/null
+++ b/third_party/intel/mwaitxintrin.internal.h
@@ -0,0 +1,27 @@
+#ifndef _MWAITXINTRIN_H_INCLUDED
+#define _MWAITXINTRIN_H_INCLUDED
+
+#ifndef __MWAITX__
+#pragma GCC push_options
+#pragma GCC target("mwaitx")
+#define __DISABLE_MWAITX__
+#endif /* __MWAITX__ */
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_monitorx(void const* __P, unsigned int __E, unsigned int __H) {
+  __builtin_ia32_monitorx(__P, __E, __H);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mwaitx(unsigned int __E, unsigned int __H, unsigned int __C) {
+  __builtin_ia32_mwaitx(__E, __H, __C);
+}
+
+#ifdef __DISABLE_MWAITX__
+#undef __DISABLE_MWAITX__
+#pragma GCC pop_options
+#endif /* __DISABLE_MWAITX__ */
+
+#endif /* _MWAITXINTRIN_H_INCLUDED */
diff --git a/third_party/intel/nmmintrin.internal.h b/third_party/intel/nmmintrin.internal.h
new file mode 100644
index 000000000..f65998a30
--- /dev/null
+++ b/third_party/intel/nmmintrin.internal.h
@@ -0,0 +1,4 @@
+#ifndef _NMMINTRIN_H_INCLUDED
+#define _NMMINTRIN_H_INCLUDED
+#include "third_party/intel/smmintrin.internal.h"
+#endif /* _NMMINTRIN_H_INCLUDED */
diff --git a/third_party/intel/pconfigintrin.internal.h b/third_party/intel/pconfigintrin.internal.h
new file mode 100644
index 000000000..a38d9195a
--- /dev/null
+++ b/third_party/intel/pconfigintrin.internal.h
@@ -0,0 +1,54 @@
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <pconfigintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _PCONFIGINTRIN_H_INCLUDED
+#define _PCONFIGINTRIN_H_INCLUDED
+
+#ifndef __PCONFIG__
+#pragma GCC push_options
+#pragma GCC target("pconfig")
+#define __DISABLE_PCONFIG__
+#endif /* __PCONFIG__ */
+
+#define __pconfig_b(leaf, b, retval) \
+  __asm__ __volatile__("pconfig\n\t" \
+                       : "=a"(retval) \
+                       : "a"(leaf), "b"(b) \
+                       : "cc")
+
+#define __pconfig_generic(leaf, b, c, d, retval) \
+  __asm__ __volatile__("pconfig\n\t" \
+                       : "=a"(retval), "=b"(b), "=c"(c), "=d"(d) \
+                       : "a"(leaf), "b"(b), "c"(c), "d"(d) \
+                       : "cc")
+
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _pconfig_u32(const unsigned int __L, size_t __D[]) {
+  enum __pconfig_type {
+    __PCONFIG_KEY_PROGRAM = 0x01,
+  };
+
+  unsigned int __R = 0;
+
+  if (!__builtin_constant_p(__L))
+    __pconfig_generic(__L, __D[0], __D[1], __D[2], __R);
+  else
+    switch (__L) {
+      case __PCONFIG_KEY_PROGRAM:
+        __pconfig_b(__L, __D[0], __R);
+        break;
+      default:
+        __pconfig_generic(__L, __D[0], __D[1], __D[2], __R);
+    }
+  return __R;
+}
+
+#ifdef __DISABLE_PCONFIG__
+#undef __DISABLE_PCONFIG__
+#pragma GCC pop_options
+#endif /* __DISABLE_PCONFIG__ */
+
+#endif /* _PCONFIGINTRIN_H_INCLUDED */
diff --git a/third_party/intel/pkuintrin.internal.h b/third_party/intel/pkuintrin.internal.h
new file mode 100644
index 000000000..b2175da76
--- /dev/null
+++ b/third_party/intel/pkuintrin.internal.h
@@ -0,0 +1,31 @@
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <pkuintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _PKUINTRIN_H_INCLUDED
+#define _PKUINTRIN_H_INCLUDED
+
+#ifndef __PKU__
+#pragma GCC push_options
+#pragma GCC target("pku")
+#define __DISABLE_PKU__
+#endif /* __PKU__ */
+
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _rdpkru_u32(void) {
+  return __builtin_ia32_rdpkru();
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _wrpkru(unsigned int __key) {
+  __builtin_ia32_wrpkru(__key);
+}
+
+#ifdef __DISABLE_PKU__
+#undef __DISABLE_PKU__
+#pragma GCC pop_options
+#endif /* __DISABLE_PKU__ */
+
+#endif /* _PKUINTRIN_H_INCLUDED */
diff --git a/third_party/intel/pmmintrin.internal.h b/third_party/intel/pmmintrin.internal.h
new file mode 100644
index 000000000..6c1d078af
--- /dev/null
+++ b/third_party/intel/pmmintrin.internal.h
@@ -0,0 +1,102 @@
+#ifndef _PMMINTRIN_H_INCLUDED
+#define _PMMINTRIN_H_INCLUDED
+#include "third_party/intel/emmintrin.internal.h"
+
+#ifndef __SSE3__
+#pragma GCC push_options
+#pragma GCC target("sse3")
+#define __DISABLE_SSE3__
+#endif /* __SSE3__ */
+
+#define _MM_DENORMALS_ZERO_MASK 0x0040
+#define _MM_DENORMALS_ZERO_ON 0x0040
+#define _MM_DENORMALS_ZERO_OFF 0x0000
+
+#define _MM_SET_DENORMALS_ZERO_MODE(mode) \
+  _mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (mode))
+#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_addsub_ps(__m128 __X, __m128 __Y) {
+  return (__m128)__builtin_ia32_addsubps((__v4sf)__X, (__v4sf)__Y);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hadd_ps(__m128 __X, __m128 __Y) {
+  return (__m128)__builtin_ia32_haddps((__v4sf)__X, (__v4sf)__Y);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hsub_ps(__m128 __X, __m128 __Y) {
+  return (__m128)__builtin_ia32_hsubps((__v4sf)__X, (__v4sf)__Y);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_movehdup_ps(__m128 __X) {
+  return (__m128)__builtin_ia32_movshdup((__v4sf)__X);
+}
+
+extern __inline __m128
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_moveldup_ps(__m128 __X) {
+  return (__m128)__builtin_ia32_movsldup((__v4sf)__X);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_addsub_pd(__m128d __X, __m128d __Y) {
+  return (__m128d)__builtin_ia32_addsubpd((__v2df)__X, (__v2df)__Y);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hadd_pd(__m128d __X, __m128d __Y) {
+  return (__m128d)__builtin_ia32_haddpd((__v2df)__X, (__v2df)__Y);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_hsub_pd(__m128d __X, __m128d __Y) {
+  return (__m128d)__builtin_ia32_hsubpd((__v2df)__X, (__v2df)__Y);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_loaddup_pd(double const *__P) {
+  return _mm_load1_pd(__P);
+}
+
+extern __inline __m128d
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_movedup_pd(__m128d __X) {
+  return _mm_shuffle_pd(__X, __X, _MM_SHUFFLE2(0, 0));
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_lddqu_si128(__m128i const *__P) {
+  return (__m128i)__builtin_ia32_lddqu((char const *)__P);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_monitor(void const *__P, unsigned int __E, unsigned int __H) {
+  __builtin_ia32_monitor(__P, __E, __H);
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_mwait(unsigned int __E, unsigned int __H) {
+  __builtin_ia32_mwait(__E, __H);
+}
+
+#ifdef __DISABLE_SSE3__
+#undef __DISABLE_SSE3__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE3__ */
+
+#endif /* _PMMINTRIN_H_INCLUDED */
diff --git a/third_party/intel/popcntintrin.internal.h b/third_party/intel/popcntintrin.internal.h
new file mode 100644
index 000000000..77e05951a
--- /dev/null
+++ b/third_party/intel/popcntintrin.internal.h
@@ -0,0 +1,29 @@
+#ifndef _POPCNTINTRIN_H_INCLUDED
+#define _POPCNTINTRIN_H_INCLUDED
+
+#ifndef __POPCNT__
+#pragma GCC push_options
+#pragma GCC target("popcnt")
+#define __DISABLE_POPCNT__
+#endif /* __POPCNT__ */
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_popcnt_u32(unsigned int __X) {
+  return __builtin_popcount(__X);
+}
+
+#ifdef __x86_64__
+extern __inline long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_popcnt_u64(unsigned long long __X) {
+  return __builtin_popcountll(__X);
+}
+#endif
+
+#ifdef __DISABLE_POPCNT__
+#undef __DISABLE_POPCNT__
+#pragma GCC pop_options
+#endif /* __DISABLE_POPCNT__ */
+
+#endif /* _POPCNTINTRIN_H_INCLUDED */
diff --git a/third_party/intel/prfchwintrin.internal.h b/third_party/intel/prfchwintrin.internal.h
new file mode 100644
index 000000000..8d727e960
--- /dev/null
+++ b/third_party/intel/prfchwintrin.internal.h
@@ -0,0 +1,15 @@
+#if !defined _IMMINTRIN_H_INCLUDED && !defined _MM3DNOW_H_INCLUDED
+#error \
+    "Never use <prfchwintrin.h> directly; include <immintrin.h> or <mm3dnow.h> instead."
+#endif
+
+#ifndef _PRFCHWINTRIN_H_INCLUDED
+#define _PRFCHWINTRIN_H_INCLUDED
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _m_prefetchw(void *__P) {
+  __builtin_prefetch(__P, 1, 3 /* _MM_HINT_T0 */);
+}
+
+#endif /* _PRFCHWINTRIN_H_INCLUDED */
diff --git a/third_party/intel/rdseedintrin.internal.h b/third_party/intel/rdseedintrin.internal.h
new file mode 100644
index 000000000..6096637bc
--- /dev/null
+++ b/third_party/intel/rdseedintrin.internal.h
@@ -0,0 +1,39 @@
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <rdseedintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _RDSEEDINTRIN_H_INCLUDED
+#define _RDSEEDINTRIN_H_INCLUDED
+
+#ifndef __RDSEED__
+#pragma GCC push_options
+#pragma GCC target("rdseed")
+#define __DISABLE_RDSEED__
+#endif /* __RDSEED__ */
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _rdseed16_step(unsigned short *__p) {
+  return __builtin_ia32_rdseed_hi_step(__p);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _rdseed32_step(unsigned int *__p) {
+  return __builtin_ia32_rdseed_si_step(__p);
+}
+
+#ifdef __x86_64__
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _rdseed64_step(unsigned long long *__p) {
+  return __builtin_ia32_rdseed_di_step(__p);
+}
+#endif
+
+#ifdef __DISABLE_RDSEED__
+#undef __DISABLE_RDSEED__
+#pragma GCC pop_options
+#endif /* __DISABLE_RDSEED__ */
+
+#endif /* _RDSEEDINTRIN_H_INCLUDED */
diff --git a/third_party/intel/rtmintrin.internal.h b/third_party/intel/rtmintrin.internal.h
new file mode 100644
index 000000000..010588e60
--- /dev/null
+++ b/third_party/intel/rtmintrin.internal.h
@@ -0,0 +1,50 @@
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _RTMINTRIN_H_INCLUDED
+#define _RTMINTRIN_H_INCLUDED
+
+#ifndef __RTM__
+#pragma GCC push_options
+#pragma GCC target("rtm")
+#define __DISABLE_RTM__
+#endif /* __RTM__ */
+
+#define _XBEGIN_STARTED (~0u)
+#define _XABORT_EXPLICIT (1 << 0)
+#define _XABORT_RETRY (1 << 1)
+#define _XABORT_CONFLICT (1 << 2)
+#define _XABORT_CAPACITY (1 << 3)
+#define _XABORT_DEBUG (1 << 4)
+#define _XABORT_NESTED (1 << 5)
+#define _XABORT_CODE(x) (((x) >> 24) & 0xFF)
+
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _xbegin(void) {
+  return __builtin_ia32_xbegin();
+}
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _xend(void) {
+  __builtin_ia32_xend();
+}
+
+#ifdef __OPTIMIZE__
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _xabort(const unsigned int __imm) {
+  __builtin_ia32_xabort(__imm);
+}
+#else
+#define _xabort(N) __builtin_ia32_xabort(N)
+#endif /* __OPTIMIZE__ */
+
+#ifdef __DISABLE_RTM__
+#undef __DISABLE_RTM__
+#pragma GCC pop_options
+#endif /* __DISABLE_RTM__ */
+
+#endif /* _RTMINTRIN_H_INCLUDED */
diff --git a/third_party/intel/sgxintrin.internal.h b/third_party/intel/sgxintrin.internal.h
new file mode 100644
index 000000000..f1e3e84b1
--- /dev/null
+++ b/third_party/intel/sgxintrin.internal.h
@@ -0,0 +1,219 @@
+#ifndef _SGXINTRIN_H_INCLUDED
+#define _SGXINTRIN_H_INCLUDED
+
+#ifndef __SGX__
+#pragma GCC push_options
+#pragma GCC target("sgx")
+#define __DISABLE_SGX__
+#endif /* __SGX__ */
+
+#define __encls_bc(leaf, b, c, retval) \
+  __asm__ __volatile__("encls\n\t" \
+                       : "=a"(retval) \
+                       : "a"(leaf), "b"(b), "c"(c) \
+                       : "cc")
+
+#define __encls_bcd(leaf, b, c, d, retval) \
+  __asm__ __volatile__("encls\n\t" \
+                       : "=a"(retval) \
+                       : "a"(leaf), "b"(b), "c"(c), "d"(d) \
+                       : "cc")
+
+#define __encls_c(leaf, c, retval) \
+  __asm__ __volatile__("encls\n\t" : "=a"(retval) : "a"(leaf), "c"(c) : "cc")
+
+#define __encls_edbgrd(leaf, b, c, retval) \
+  __asm__ __volatile__("encls\n\t" : "=a"(retval), "=b"(b) : "a"(leaf), "c"(c))
+
+#define __encls_generic(leaf, b, c, d, retval) \
+  __asm__ __volatile__("encls\n\t" \
+                       : "=a"(retval), "=b"(b), "=c"(c), "=d"(d) \
+                       : "a"(leaf), "b"(b), "c"(c), "d"(d) \
+                       : "cc")
+
+#define __enclu_bc(leaf, b, c, retval) \
+  __asm__ __volatile__("enclu\n\t" \
+                       : "=a"(retval) \
+                       : "a"(leaf), "b"(b), "c"(c) \
+                       : "cc")
+
+#define __enclu_bcd(leaf, b, c, d, retval) \
+  __asm__ __volatile__("enclu\n\t" \
+                       : "=a"(retval) \
+                       : "a"(leaf), "b"(b), "c"(c), "d"(d) \
+                       : "cc")
+
+#define __enclu_eenter(leaf, b, c, retval) \
+  __asm__ __volatile__("enclu\n\t" \
+                       : "=a"(retval), "=c"(c) \
+                       : "a"(leaf), "b"(b), "c"(c) \
+                       : "cc")
+
+#define __enclu_eexit(leaf, b, c, retval) \
+  __asm__ __volatile__("enclu\n\t" \
+                       : "=a"(retval), "=c"(c) \
+                       : "a"(leaf), "b"(b) \
+                       : "cc")
+
+#define __enclu_generic(leaf, b, c, d, retval) \
+  __asm__ __volatile__("enclu\n\t" \
+                       : "=a"(retval), "=b"(b), "=c"(c), "=d"(d) \
+                       : "a"(leaf), "b"(b), "c"(c), "d"(d) \
+                       : "cc")
+
+#define __enclv_bc(leaf, b, c, retval) \
+  __asm__ __volatile__("enclv\n\t" \
+                       : "=a"(retval) \
+                       : "a"(leaf), "b"(b), "c"(c) \
+                       : "cc")
+
+#define __enclv_cd(leaf, c, d, retval) \
+  __asm__ __volatile__("enclv\n\t" \
+                       : "=a"(retval) \
+                       : "a"(leaf), "c"(c), "d"(d) \
+                       : "cc")
+
+#define __enclv_generic(leaf, b, c, d, retval) \
+  __asm__ __volatile__("enclv\n\t" \
+                       : "=a"(retval), "=b"(b), "=c"(c), "=d"(d) \
+                       : "a"(leaf), "b"(b), "c"(c), "d"(d) \
+                       : "cc")
+
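[An editorial aside, not part of the commit: each of the `__encls_*`/`__enclu_*`/`__enclv_*` macros above hard-codes a different subset of the rbx/rcx/rdx operands, and the `_encls_u32`-style dispatchers that follow pick among them with `__builtin_constant_p`, the same trick `_pconfig_u32` used earlier: a compile-time-constant leaf folds the `switch` away, so exactly one inline `encls` with the minimal register set is emitted. A self-contained analogue of the idiom, with invented helper names standing in for the asm forms:

    /* illustrative sketch only; helper names are hypothetical */
    static inline unsigned wide_form(unsigned leaf, const unsigned long d[3]) {
      return leaf ^ (unsigned)(d[0] ^ d[1] ^ d[2]); /* 4-register asm stand-in */
    }
    static inline unsigned narrow_form(unsigned leaf, unsigned long b) {
      return leaf ^ (unsigned)b; /* 2-register asm stand-in */
    }
    static inline unsigned leaf_dispatch(const unsigned leaf,
                                         const unsigned long d[3]) {
      if (!__builtin_constant_p(leaf)) /* leaf unknown: widest form */
        return wide_form(leaf, d);
      switch (leaf) { /* leaf known: pick the exact operand set */
        case 1:
          return narrow_form(leaf, d[0]);
        default:
          return wide_form(leaf, d);
      }
    }

Because the real wrappers are `always_inline`, `__builtin_constant_p` is evaluated after inlining, which is what lets literal leaf numbers take the specialized path.]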
+extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _encls_u32(const unsigned int __L, size_t __D[]) { + enum __encls_type { + __SGX_ECREATE = 0x00, + __SGX_EADD = 0x01, + __SGX_EINIT = 0x02, + __SGX_EREMOVE = 0x03, + __SGX_EDBGRD = 0x04, + __SGX_EDBGWR = 0x05, + __SGX_EEXTEND = 0x06, + __SGX_ELDB = 0x07, + __SGX_ELDU = 0x08, + __SGX_EBLOCK = 0x09, + __SGX_EPA = 0x0A, + __SGX_EWB = 0x0B, + __SGX_ETRACK = 0x0C, + __SGX_EAUG = 0x0D, + __SGX_EMODPR = 0x0E, + __SGX_EMODT = 0x0F, + __SGX_ERDINFO = 0x10, + __SGX_ETRACKC = 0x11, + __SGX_ELDBC = 0x12, + __SGX_ELDUC = 0x13 + }; + enum __encls_type __T = (enum __encls_type)__L; + unsigned int __R = 0; + if (!__builtin_constant_p(__T)) + __encls_generic(__L, __D[0], __D[1], __D[2], __R); + else + switch (__T) { + case __SGX_ECREATE: + case __SGX_EADD: + case __SGX_EDBGWR: + case __SGX_EEXTEND: + case __SGX_EPA: + case __SGX_EMODPR: + case __SGX_EMODT: + case __SGX_EAUG: + case __SGX_ERDINFO: + __encls_bc(__L, __D[0], __D[1], __R); + break; + case __SGX_EINIT: + case __SGX_ELDB: + case __SGX_ELDU: + case __SGX_EWB: + case __SGX_ELDBC: + case __SGX_ELDUC: + __encls_bcd(__L, __D[0], __D[1], __D[2], __R); + break; + case __SGX_EREMOVE: + case __SGX_EBLOCK: + case __SGX_ETRACK: + case __SGX_ETRACKC: + __encls_c(__L, __D[1], __R); + break; + case __SGX_EDBGRD: + __encls_edbgrd(__L, __D[0], __D[1], __R); + break; + default: + __encls_generic(__L, __D[0], __D[1], __D[2], __R); + } + return __R; +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _enclu_u32(const unsigned int __L, size_t __D[]) { + enum __enclu_type { + __SGX_EREPORT = 0x00, + __SGX_EGETKEY = 0x01, + __SGX_EENTER = 0x02, + __SGX_ERESUME = 0x03, + __SGX_EEXIT = 0x04, + __SGX_EACCEPT = 0x05, + __SGX_EMODPE = 0x06, + __SGX_EACCEPTCOPY = 0x07 + }; + enum __enclu_type __T = (enum __enclu_type)__L; + unsigned int __R = 0; + if (!__builtin_constant_p(__T)) + __enclu_generic(__L, __D[0], __D[1], __D[2], __R); + else + switch (__T) { + case __SGX_EREPORT: + case __SGX_EACCEPTCOPY: + __enclu_bcd(__L, __D[0], __D[1], __D[2], __R); + break; + case __SGX_EGETKEY: + case __SGX_ERESUME: + case __SGX_EACCEPT: + case __SGX_EMODPE: + __enclu_bc(__L, __D[0], __D[1], __R); + break; + case __SGX_EENTER: + __enclu_eenter(__L, __D[0], __D[1], __R); + break; + case __SGX_EEXIT: + __enclu_eexit(__L, __D[0], __D[1], __R); + break; + default: + __enclu_generic(__L, __D[0], __D[1], __D[2], __R); + } + return __R; +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _enclv_u32(const unsigned int __L, size_t __D[]) { + enum __enclv_type { + __SGX_EDECVIRTCHILD = 0x00, + __SGX_EINCVIRTCHILD = 0x01, + __SGX_ESETCONTEXT = 0x02 + }; + unsigned int __R = 0; + if (!__builtin_constant_p(__L)) + __enclv_generic(__L, __D[0], __D[1], __D[2], __R); + else + switch (__L) { + case __SGX_EDECVIRTCHILD: + case __SGX_EINCVIRTCHILD: + __enclv_bc(__L, __D[0], __D[1], __R); + break; + case __SGX_ESETCONTEXT: + __enclv_cd(__L, __D[1], __D[2], __R); + break; + default: + __enclv_generic(__L, __D[0], __D[1], __D[2], __R); + } + return __R; +} + +#ifdef __DISABLE_SGX__ +#undef __DISABLE_SGX__ +#pragma GCC pop_options +#endif /* __DISABLE_SGX__ */ + +#endif /* _SGXINTRIN_H_INCLUDED */ diff --git a/third_party/intel/shaintrin.internal.h b/third_party/intel/shaintrin.internal.h new file mode 100644 index 000000000..05446f46e --- /dev/null +++ b/third_party/intel/shaintrin.internal.h @@ -0,0 
+1,68 @@
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _SHAINTRIN_H_INCLUDED
+#define _SHAINTRIN_H_INCLUDED
+
+#ifndef __SHA__
+#pragma GCC push_options
+#pragma GCC target("sha")
+#define __DISABLE_SHA__
+#endif /* __SHA__ */
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sha1msg1_epu32(__m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_sha1msg1((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sha1msg2_epu32(__m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_sha1msg2((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sha1nexte_epu32(__m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_sha1nexte((__v4si)__A, (__v4si)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sha1rnds4_epu32(__m128i __A, __m128i __B, const int __I) {
+  return (__m128i)__builtin_ia32_sha1rnds4((__v4si)__A, (__v4si)__B, __I);
+}
+#else
+#define _mm_sha1rnds4_epu32(A, B, I) \
+  ((__m128i)__builtin_ia32_sha1rnds4((__v4si)(__m128i)A, (__v4si)(__m128i)B, \
+                                     (int)I))
+#endif
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sha256msg1_epu32(__m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_sha256msg1((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sha256msg2_epu32(__m128i __A, __m128i __B) {
+  return (__m128i)__builtin_ia32_sha256msg2((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_sha256rnds2_epu32(__m128i __A, __m128i __B, __m128i __C) {
+  return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__A, (__v4si)__B,
+                                             (__v4si)__C);
+}
+
+#ifdef __DISABLE_SHA__
+#undef __DISABLE_SHA__
+#pragma GCC pop_options
+#endif /* __DISABLE_SHA__ */
+
+#endif /* _SHAINTRIN_H_INCLUDED */
diff --git a/third_party/intel/smmintrin.internal.h b/third_party/intel/smmintrin.internal.h
new file mode 100644
index 000000000..6d7ca9969
--- /dev/null
+++ b/third_party/intel/smmintrin.internal.h
@@ -0,0 +1,705 @@
+#ifndef _SMMINTRIN_H_INCLUDED
+#define _SMMINTRIN_H_INCLUDED
+#include "third_party/intel/tmmintrin.internal.h"
+
+#ifndef __SSE4_1__
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define __DISABLE_SSE4_1__
+#endif /* __SSE4_1__ */
+
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+
+#define _MM_FROUND_RAISE_EXC 0x00
+#define _MM_FROUND_NO_EXC 0x08
+
+#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_testz_si128(__m128i __M, __m128i __V) {
+  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
+}
+
+extern __inline
int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_testc_si128(__m128i __M, __m128i __V) { + return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_testnzc_si128(__m128i __M, __m128i __V) { + return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); +} + +#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V)) + +#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V))) + +#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) + +#ifdef __OPTIMIZE__ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_round_pd(__m128d __V, const int __M) { + return (__m128d)__builtin_ia32_roundpd((__v2df)__V, __M); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_round_sd(__m128d __D, __m128d __V, const int __M) { + return (__m128d)__builtin_ia32_roundsd((__v2df)__D, (__v2df)__V, __M); +} +#else +#define _mm_round_pd(V, M) \ + ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(V), (int)(M))) + +#define _mm_round_sd(D, V, M) \ + ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(D), (__v2df)(__m128d)(V), \ + (int)(M))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_round_ps(__m128 __V, const int __M) { + return (__m128)__builtin_ia32_roundps((__v4sf)__V, __M); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_round_ss(__m128 __D, __m128 __V, const int __M) { + return (__m128)__builtin_ia32_roundss((__v4sf)__D, (__v4sf)__V, __M); +} +#else +#define _mm_round_ps(V, M) \ + ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(V), (int)(M))) + +#define _mm_round_ss(D, V, M) \ + ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(D), (__v4sf)(__m128)(V), \ + (int)(M))) +#endif + +#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL) +#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR) +#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR) + +#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL) +#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR) +#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR) + +#ifdef __OPTIMIZE__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_blend_epi16(__m128i __X, __m128i __Y, const int __M) { + return (__m128i)__builtin_ia32_pblendw128((__v8hi)__X, (__v8hi)__Y, __M); +} +#else +#define _mm_blend_epi16(X, Y, M) \ + ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(X), \ + (__v8hi)(__m128i)(Y), (int)(M))) +#endif + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_blendv_epi8(__m128i __X, __m128i __Y, __m128i __M) { + return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__X, (__v16qi)__Y, + (__v16qi)__M); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_blend_ps(__m128 __X, __m128 __Y, const int __M) { + return (__m128)__builtin_ia32_blendps((__v4sf)__X, (__v4sf)__Y, __M); +} +#else +#define _mm_blend_ps(X, Y, M) \ + ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ + (int)(M))) +#endif + +extern 
__inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_blendv_ps(__m128 __X, __m128 __Y, __m128 __M) { + return (__m128)__builtin_ia32_blendvps((__v4sf)__X, (__v4sf)__Y, (__v4sf)__M); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_blend_pd(__m128d __X, __m128d __Y, const int __M) { + return (__m128d)__builtin_ia32_blendpd((__v2df)__X, (__v2df)__Y, __M); +} +#else +#define _mm_blend_pd(X, Y, M) \ + ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \ + (int)(M))) +#endif + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_blendv_pd(__m128d __X, __m128d __Y, __m128d __M) { + return (__m128d)__builtin_ia32_blendvpd((__v2df)__X, (__v2df)__Y, + (__v2df)__M); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_dp_ps(__m128 __X, __m128 __Y, const int __M) { + return (__m128)__builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, __M); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_dp_pd(__m128d __X, __m128d __Y, const int __M) { + return (__m128d)__builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, __M); +} +#else +#define _mm_dp_ps(X, Y, M) \ + ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ + (int)(M))) + +#define _mm_dp_pd(X, Y, M) \ + ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \ + (int)(M))) +#endif + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_epi64(__m128i __X, __m128i __Y) { + return (__m128i)((__v2di)__X == (__v2di)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_epi8(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pminsb128((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_epi8(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pmaxsb128((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_epu16(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pminuw128((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_epu16(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pmaxuw128((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_epi32(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pminsd128((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_epi32(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pmaxsd128((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_epu32(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pminud128((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_epu32(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pmaxud128((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm_mullo_epi32(__m128i __X, __m128i __Y) { + return (__m128i)((__v4su)__X * (__v4su)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_epi32(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pmuldq128((__v4si)__X, (__v4si)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_ps(__m128 __D, __m128 __S, const int __N) { + return (__m128)__builtin_ia32_insertps128((__v4sf)__D, (__v4sf)__S, __N); +} +#else +#define _mm_insert_ps(D, S, N) \ + ((__m128)__builtin_ia32_insertps128((__v4sf)(__m128)(D), \ + (__v4sf)(__m128)(S), (int)(N))) +#endif + +#define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M)) + +#ifdef __OPTIMIZE__ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_ps(__m128 __X, const int __N) { + union { + int i; + float f; + } __tmp; + __tmp.f = __builtin_ia32_vec_ext_v4sf((__v4sf)__X, __N); + return __tmp.i; +} +#else +#define _mm_extract_ps(X, N) \ + (__extension__({ \ + union { \ + int i; \ + float f; \ + } __tmp; \ + __tmp.f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \ + __tmp.i; \ + })) +#endif + +#define _MM_EXTRACT_FLOAT(D, S, N) \ + { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(S), (N)); } + +#define _MM_PICK_OUT_PS(X, N) \ + _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e)) + +#ifdef __OPTIMIZE__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_epi8(__m128i __D, int __S, const int __N) { + return (__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)__D, __S, __N); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_epi32(__m128i __D, int __S, const int __N) { + return (__m128i)__builtin_ia32_vec_set_v4si((__v4si)__D, __S, __N); +} + +#ifdef __x86_64__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_epi64(__m128i __D, long long __S, const int __N) { + return (__m128i)__builtin_ia32_vec_set_v2di((__v2di)__D, __S, __N); +} +#endif +#else +#define _mm_insert_epi8(D, S, N) \ + ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(D), (int)(S), \ + (int)(N))) + +#define _mm_insert_epi32(D, S, N) \ + ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(D), (int)(S), \ + (int)(N))) + +#ifdef __x86_64__ +#define _mm_insert_epi64(D, S, N) \ + ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(D), (long long)(S), \ + (int)(N))) +#endif +#endif + +#ifdef __OPTIMIZE__ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_epi8(__m128i __X, const int __N) { + return (unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)__X, __N); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_epi32(__m128i __X, const int __N) { + return __builtin_ia32_vec_ext_v4si((__v4si)__X, __N); +} + +#ifdef __x86_64__ +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_epi64(__m128i __X, const int __N) { + return __builtin_ia32_vec_ext_v2di((__v2di)__X, __N); +} +#endif +#else +#define _mm_extract_epi8(X, N) \ + ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \ + (int)(N))) +#define _mm_extract_epi32(X, N) \ + ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), 
(int)(N))) + +#ifdef __x86_64__ +#define _mm_extract_epi64(X, N) \ + ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))) +#endif +#endif + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_minpos_epu16(__m128i __X) { + return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi8_epi32(__m128i __X) { + return (__m128i)__builtin_ia32_pmovsxbd128((__v16qi)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi16_epi32(__m128i __X) { + return (__m128i)__builtin_ia32_pmovsxwd128((__v8hi)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi8_epi64(__m128i __X) { + return (__m128i)__builtin_ia32_pmovsxbq128((__v16qi)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi32_epi64(__m128i __X) { + return (__m128i)__builtin_ia32_pmovsxdq128((__v4si)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi16_epi64(__m128i __X) { + return (__m128i)__builtin_ia32_pmovsxwq128((__v8hi)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepi8_epi16(__m128i __X) { + return (__m128i)__builtin_ia32_pmovsxbw128((__v16qi)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu8_epi32(__m128i __X) { + return (__m128i)__builtin_ia32_pmovzxbd128((__v16qi)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu16_epi32(__m128i __X) { + return (__m128i)__builtin_ia32_pmovzxwd128((__v8hi)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu8_epi64(__m128i __X) { + return (__m128i)__builtin_ia32_pmovzxbq128((__v16qi)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu32_epi64(__m128i __X) { + return (__m128i)__builtin_ia32_pmovzxdq128((__v4si)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu16_epi64(__m128i __X) { + return (__m128i)__builtin_ia32_pmovzxwq128((__v8hi)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtepu8_epi16(__m128i __X) { + return (__m128i)__builtin_ia32_pmovzxbw128((__v16qi)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_packus_epi32(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_packusdw128((__v4si)__X, (__v4si)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mpsadbw_epu8(__m128i __X, __m128i __Y, const int __M) { + return (__m128i)__builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, __M); +} +#else +#define _mm_mpsadbw_epu8(X, Y, M) \ + ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) +#endif + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_load_si128(__m128i *__X) { + return (__m128i)__builtin_ia32_movntdqa((__v2di *)__X); +} + +#ifndef __SSE4_2__ +#pragma GCC push_options 
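/* [editor's note] the push_options/target/__DISABLE_* pattern used
   throughout these headers temporarily enables the ISA extension so the
   builtins compile even in translation units built without (here)
   -msse4.2; the matching pop_options restores the caller's target. The
   always_inline intrinsics still refuse to inline into functions that
   don't themselves enable the feature, so runtime dispatch should be
   guarded with __builtin_cpu_supports("sse4.2"). */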
+#pragma GCC target("sse4.2") +#define __DISABLE_SSE4_2__ +#endif + +#define _SIDD_UBYTE_OPS 0x00 +#define _SIDD_UWORD_OPS 0x01 +#define _SIDD_SBYTE_OPS 0x02 +#define _SIDD_SWORD_OPS 0x03 + +#define _SIDD_CMP_EQUAL_ANY 0x00 +#define _SIDD_CMP_RANGES 0x04 +#define _SIDD_CMP_EQUAL_EACH 0x08 +#define _SIDD_CMP_EQUAL_ORDERED 0x0c + +#define _SIDD_POSITIVE_POLARITY 0x00 +#define _SIDD_NEGATIVE_POLARITY 0x10 +#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 +#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 + +#define _SIDD_LEAST_SIGNIFICANT 0x00 +#define _SIDD_MOST_SIGNIFICANT 0x40 + +#define _SIDD_BIT_MASK 0x00 +#define _SIDD_UNIT_MASK 0x40 + +#ifdef __OPTIMIZE__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpistrm(__m128i __X, __m128i __Y, const int __M) { + return (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)__X, (__v16qi)__Y, __M); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpistri(__m128i __X, __m128i __Y, const int __M) { + return __builtin_ia32_pcmpistri128((__v16qi)__X, (__v16qi)__Y, __M); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpestrm(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) { + return (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)__X, __LX, (__v16qi)__Y, + __LY, __M); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpestri(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) { + return __builtin_ia32_pcmpestri128((__v16qi)__X, __LX, (__v16qi)__Y, __LY, + __M); +} +#else +#define _mm_cmpistrm(X, Y, M) \ + ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) +#define _mm_cmpistri(X, Y, M) \ + ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) + +#define _mm_cmpestrm(X, LX, Y, LY, M) \ + ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(X), (int)(LX), \ + (__v16qi)(__m128i)(Y), (int)(LY), \ + (int)(M))) +#define _mm_cmpestri(X, LX, Y, LY, M) \ + ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(X), (int)(LX), \ + (__v16qi)(__m128i)(Y), (int)(LY), \ + (int)(M))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpistra(__m128i __X, __m128i __Y, const int __M) { + return __builtin_ia32_pcmpistria128((__v16qi)__X, (__v16qi)__Y, __M); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpistrc(__m128i __X, __m128i __Y, const int __M) { + return __builtin_ia32_pcmpistric128((__v16qi)__X, (__v16qi)__Y, __M); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpistro(__m128i __X, __m128i __Y, const int __M) { + return __builtin_ia32_pcmpistrio128((__v16qi)__X, (__v16qi)__Y, __M); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpistrs(__m128i __X, __m128i __Y, const int __M) { + return __builtin_ia32_pcmpistris128((__v16qi)__X, (__v16qi)__Y, __M); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpistrz(__m128i __X, __m128i __Y, const int __M) { + return __builtin_ia32_pcmpistriz128((__v16qi)__X, (__v16qi)__Y, __M); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpestra(__m128i __X, int __LX, __m128i __Y, int 
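/* [editor's illustration, not part of the upstream header] the _SIDD_*
   flags above pick the element type, comparison mode, polarity, and
   result encoding for this pcmpistr/pcmpestr family. A minimal sketch,
   assuming 16 readable bytes at p, that locates the first '\n':

     __m128i needle = _mm_cvtsi32_si128('\n');
     __m128i hay = _mm_loadu_si128((const __m128i *)p);
     int i = _mm_cmpistri(needle, hay, _SIDD_UBYTE_OPS |
                                       _SIDD_CMP_EQUAL_ANY |
                                       _SIDD_LEAST_SIGNIFICANT);
     // i is the index of the first match, or 16 when nothing matched

   The "i" forms take implicit NUL-terminated lengths; the "e" forms take
   explicit element counts __LX and __LY. */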
__LY, const int __M) {
+  return __builtin_ia32_pcmpestria128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
+                                      __M);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpestrc(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) {
+  return __builtin_ia32_pcmpestric128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
+                                      __M);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpestro(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) {
+  return __builtin_ia32_pcmpestrio128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
+                                      __M);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpestrs(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) {
+  return __builtin_ia32_pcmpestris128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
+                                      __M);
+}
+
+extern __inline int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpestrz(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) {
+  return __builtin_ia32_pcmpestriz128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
+                                      __M);
+}
+#else
+#define _mm_cmpistra(X, Y, M)                               \
+  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(X), \
+                                     (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistrc(X, Y, M)                               \
+  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(X), \
+                                     (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistro(X, Y, M)                               \
+  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(X), \
+                                     (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistrs(X, Y, M)                               \
+  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(X), \
+                                     (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistrz(X, Y, M)                               \
+  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(X), \
+                                     (__v16qi)(__m128i)(Y), (int)(M)))
+
+#define _mm_cmpestra(X, LX, Y, LY, M)                                  \
+  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(X), (int)(LX), \
+                                     (__v16qi)(__m128i)(Y), (int)(LY), \
+                                     (int)(M)))
+#define _mm_cmpestrc(X, LX, Y, LY, M)                                  \
+  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(X), (int)(LX), \
+                                     (__v16qi)(__m128i)(Y), (int)(LY), \
+                                     (int)(M)))
+#define _mm_cmpestro(X, LX, Y, LY, M)                                  \
+  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(X), (int)(LX), \
+                                     (__v16qi)(__m128i)(Y), (int)(LY), \
+                                     (int)(M)))
+#define _mm_cmpestrs(X, LX, Y, LY, M)                                  \
+  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(X), (int)(LX), \
+                                     (__v16qi)(__m128i)(Y), (int)(LY), \
+                                     (int)(M)))
+#define _mm_cmpestrz(X, LX, Y, LY, M)                                  \
+  ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(X), (int)(LX), \
+                                     (__v16qi)(__m128i)(Y), (int)(LY), \
+                                     (int)(M)))
+#endif
+
+extern __inline __m128i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
+  return (__m128i)((__v2di)__X > (__v2di)__Y);
+}
+
+#ifdef __DISABLE_SSE4_2__
+#undef __DISABLE_SSE4_2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4_2__ */
+
+#ifdef __DISABLE_SSE4_1__
+#undef __DISABLE_SSE4_1__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4_1__ */
+
+#include "third_party/intel/popcntintrin.internal.h"
+
+#ifndef __SSE4_1__
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define __DISABLE_SSE4_1__
+#endif /* __SSE4_1__ */
+
+#ifndef __SSE4_2__
+#pragma GCC push_options
+#pragma GCC target("sse4.2")
+#define __DISABLE_SSE4_2__
+#endif /* __SSE4_2__ */
+
+/* Accumulate CRC32 (polynomial 0x11EDC6F41) value.
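   [editor's note] This is CRC-32C (Castagnoli), the checksum used by
   iSCSI and ext4, not the zlib/gzip CRC-32. A minimal sketch of the
   conventional usage, seeding with all-ones and inverting at the end:

     unsigned crc32c(const unsigned char *p, unsigned long n) {
       unsigned c = 0xffffffff;
       while (n--) c = _mm_crc32_u8(c, *p++);
       return ~c;
     }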
*/
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_crc32_u8(unsigned int __C, unsigned char __V) {
+  return __builtin_ia32_crc32qi(__C, __V);
+}
+
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_crc32_u16(unsigned int __C, unsigned short __V) {
+  return __builtin_ia32_crc32hi(__C, __V);
+}
+
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_crc32_u32(unsigned int __C, unsigned int __V) {
+  return __builtin_ia32_crc32si(__C, __V);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned long long
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm_crc32_u64(unsigned long long __C, unsigned long long __V) {
+  return __builtin_ia32_crc32di(__C, __V);
+}
+#endif
+
+#ifdef __DISABLE_SSE4_2__
+#undef __DISABLE_SSE4_2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4_2__ */
+
+#ifdef __DISABLE_SSE4_1__
+#undef __DISABLE_SSE4_1__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4_1__ */
+
+#endif /* _SMMINTRIN_H_INCLUDED */
diff --git a/third_party/intel/tbmintrin.internal.h b/third_party/intel/tbmintrin.internal.h
new file mode 100644
index 000000000..a1c93feec
--- /dev/null
+++ b/third_party/intel/tbmintrin.internal.h
@@ -0,0 +1,154 @@
+#ifndef _X86INTRIN_H_INCLUDED
+#error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _TBMINTRIN_H_INCLUDED
+#define _TBMINTRIN_H_INCLUDED
+
+#ifndef __TBM__
+#pragma GCC push_options
+#pragma GCC target("tbm")
+#define __DISABLE_TBM__
+#endif /* __TBM__ */
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    __bextri_u32(unsigned int __X, const unsigned int __I) {
+  return __builtin_ia32_bextri_u32(__X, __I);
+}
+#else
+#define __bextri_u32(X, I)                                    \
+  ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(X), \
+                                           (unsigned int)(I)))
+#endif /*__OPTIMIZE__ */
+
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    __blcfill_u32(unsigned int __X) {
+  return __X & (__X + 1);
+}
+
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    __blci_u32(unsigned int __X) {
+  return __X | ~(__X + 1);
+}
+
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    __blcic_u32(unsigned int __X) {
+  return ~__X & (__X + 1);
+}
+
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    __blcmsk_u32(unsigned int __X) {
+  return __X ^ (__X + 1);
+}
+
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    __blcs_u32(unsigned int __X) {
+  return __X | (__X + 1);
+}
+
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    __blsfill_u32(unsigned int __X) {
+  return __X | (__X - 1);
+}
+
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    __blsic_u32(unsigned int __X) {
+  return ~__X | (__X - 1);
+}
+
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    __t1mskc_u32(unsigned int __X) {
+  return ~__X | (__X + 1);
+}
+
+extern __inline unsigned int
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    __tzmsk_u32(unsigned int __X) {
+  return ~__X & (__X - 1);
+}
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline unsigned long long
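/* [editor's note] TBM (trailing bit manipulation) was only ever shipped
   on AMD Piledriver through Excavator cores. Except for __bextri_*, GCC
   lowers these to the plain integer expressions shown above; e.g.
   __tzmsk_u32(x), i.e. ~x & (x - 1), yields a mask of the zero bits
   below the lowest set bit of x. */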
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bextri_u64(unsigned long long __X, const unsigned int __I) { + return __builtin_ia32_bextri_u64(__X, __I); +} +#else +#define __bextri_u64(X, I) \ + ((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(X), \ + (unsigned long long)(I))) +#endif /*__OPTIMIZE__ */ + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blcfill_u64(unsigned long long __X) { + return __X & (__X + 1); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blci_u64(unsigned long long __X) { + return __X | ~(__X + 1); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blcic_u64(unsigned long long __X) { + return ~__X & (__X + 1); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blcmsk_u64(unsigned long long __X) { + return __X ^ (__X + 1); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blcs_u64(unsigned long long __X) { + return __X | (__X + 1); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsfill_u64(unsigned long long __X) { + return __X | (__X - 1); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsic_u64(unsigned long long __X) { + return ~__X | (__X - 1); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __t1mskc_u64(unsigned long long __X) { + return ~__X | (__X + 1); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzmsk_u64(unsigned long long __X) { + return ~__X & (__X - 1); +} + +#endif /* __x86_64__ */ + +#ifdef __DISABLE_TBM__ +#undef __DISABLE_TBM__ +#pragma GCC pop_options +#endif /* __DISABLE_TBM__ */ + +#endif /* _TBMINTRIN_H_INCLUDED */ diff --git a/third_party/intel/tmmintrin.internal.h b/third_party/intel/tmmintrin.internal.h new file mode 100644 index 000000000..4632027ec --- /dev/null +++ b/third_party/intel/tmmintrin.internal.h @@ -0,0 +1,217 @@ +#ifndef _TMMINTRIN_H_INCLUDED +#define _TMMINTRIN_H_INCLUDED +#include "third_party/intel/pmmintrin.internal.h" + +#ifndef __SSSE3__ +#pragma GCC push_options +#pragma GCC target("ssse3") +#define __DISABLE_SSSE3__ +#endif /* __SSSE3__ */ + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hadd_epi16(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_phaddw128((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hadd_epi32(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_phaddd128((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hadds_epi16(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hadd_pi16(__m64 __X, __m64 __Y) { + return (__m64)__builtin_ia32_phaddw((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hadd_pi32(__m64 __X, __m64 __Y) { + 
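  /* [editor's note] horizontal add: the result lanes are
     { __X[0] + __X[1], __Y[0] + __Y[1] }; the 128-bit epi16/epi32
     variants above pair up adjacent lanes in the same fashion. */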
return (__m64)__builtin_ia32_phaddd((__v2si)__X, (__v2si)__Y); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hadds_pi16(__m64 __X, __m64 __Y) { + return (__m64)__builtin_ia32_phaddsw((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsub_epi16(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_phsubw128((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsub_epi32(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_phsubd128((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsubs_epi16(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsub_pi16(__m64 __X, __m64 __Y) { + return (__m64)__builtin_ia32_phsubw((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsub_pi32(__m64 __X, __m64 __Y) { + return (__m64)__builtin_ia32_phsubd((__v2si)__X, (__v2si)__Y); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsubs_pi16(__m64 __X, __m64 __Y) { + return (__m64)__builtin_ia32_phsubsw((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maddubs_epi16(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maddubs_pi16(__m64 __X, __m64 __Y) { + return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__X, (__v8qi)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mulhrs_epi16(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mulhrs_pi16(__m64 __X, __m64 __Y) { + return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shuffle_epi8(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_pshufb128((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shuffle_pi8(__m64 __X, __m64 __Y) { + return (__m64)__builtin_ia32_pshufb((__v8qi)__X, (__v8qi)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sign_epi8(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psignb128((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sign_epi16(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psignw128((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sign_epi32(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_psignd128((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
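/* [editor's note] for _mm_sign_pi8/_pi16/_pi32 below, as for the 128-bit
   forms above: each result lane is the corresponding lane of __X when
   __Y's lane is positive, zero when __Y's lane is zero, and the negation
   of __X's lane when __Y's lane is negative. */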
_mm_sign_pi8(__m64 __X, __m64 __Y) { + return (__m64)__builtin_ia32_psignb((__v8qi)__X, (__v8qi)__Y); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sign_pi16(__m64 __X, __m64 __Y) { + return (__m64)__builtin_ia32_psignw((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sign_pi32(__m64 __X, __m64 __Y) { + return (__m64)__builtin_ia32_psignd((__v2si)__X, (__v2si)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) { + return (__m128i)__builtin_ia32_palignr128((__v2di)__X, (__v2di)__Y, __N * 8); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_alignr_pi8(__m64 __X, __m64 __Y, const int __N) { + return (__m64)__builtin_ia32_palignr((__v1di)__X, (__v1di)__Y, __N * 8); +} +#else +#define _mm_alignr_epi8(X, Y, N) \ + ((__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (int)(N)*8)) +#define _mm_alignr_pi8(X, Y, N) \ + ((__m64)__builtin_ia32_palignr((__v1di)(__m64)(X), (__v1di)(__m64)(Y), \ + (int)(N)*8)) +#endif + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_abs_epi8(__m128i __X) { + return (__m128i)__builtin_ia32_pabsb128((__v16qi)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_abs_epi16(__m128i __X) { + return (__m128i)__builtin_ia32_pabsw128((__v8hi)__X); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_abs_epi32(__m128i __X) { + return (__m128i)__builtin_ia32_pabsd128((__v4si)__X); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_abs_pi8(__m64 __X) { + return (__m64)__builtin_ia32_pabsb((__v8qi)__X); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_abs_pi16(__m64 __X) { + return (__m64)__builtin_ia32_pabsw((__v4hi)__X); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_abs_pi32(__m64 __X) { + return (__m64)__builtin_ia32_pabsd((__v2si)__X); +} + +#ifdef __DISABLE_SSSE3__ +#undef __DISABLE_SSSE3__ +#pragma GCC pop_options +#endif /* __DISABLE_SSSE3__ */ + +#endif /* _TMMINTRIN_H_INCLUDED */ diff --git a/third_party/intel/vaesintrin.internal.h b/third_party/intel/vaesintrin.internal.h new file mode 100644 index 000000000..604cb9ddd --- /dev/null +++ b/third_party/intel/vaesintrin.internal.h @@ -0,0 +1,75 @@ +#ifndef __VAESINTRIN_H_INCLUDED +#define __VAESINTRIN_H_INCLUDED +#include "third_party/intel/x86intrin.internal.h" + +#if !defined(__VAES__) || !defined(__AVX__) +#pragma GCC push_options +#pragma GCC target("vaes,avx") +#define __DISABLE_VAES__ +#endif /* __VAES__ */ + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_aesdec_epi128(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_vaesdec_v32qi((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_aesdeclast_epi128(__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_vaesdeclast_v32qi((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
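/* [editor's note] VAES widens the AES-NI round primitives to every
   128-bit lane of the vector: AESENC performs one encryption round
   (ShiftRows, SubBytes, MixColumns, then the round-key XOR), and the
   "last" variants omit MixColumns for the final round. */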
+    _mm256_aesenc_epi128(__m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vaesenc_v32qi((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_aesenclast_epi128(__m256i __A, __m256i __B) {
+  return (__m256i)__builtin_ia32_vaesenclast_v32qi((__v32qi)__A, (__v32qi)__B);
+}
+
+#ifdef __DISABLE_VAES__
+#undef __DISABLE_VAES__
+#pragma GCC pop_options
+#endif /* __DISABLE_VAES__ */
+
+#if !defined(__VAES__) || !defined(__AVX512F__)
+#pragma GCC push_options
+#pragma GCC target("vaes,avx512f")
+#define __DISABLE_VAESF__
+#endif /* __VAES__ */
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_aesdec_epi128(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_vaesdec_v64qi((__v64qi)__A, (__v64qi)__B);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_aesdeclast_epi128(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_vaesdeclast_v64qi((__v64qi)__A, (__v64qi)__B);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_aesenc_epi128(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_vaesenc_v64qi((__v64qi)__A, (__v64qi)__B);
+}
+
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_aesenclast_epi128(__m512i __A, __m512i __B) {
+  return (__m512i)__builtin_ia32_vaesenclast_v64qi((__v64qi)__A, (__v64qi)__B);
+}
+
+#ifdef __DISABLE_VAESF__
+#undef __DISABLE_VAESF__
+#pragma GCC pop_options
+#endif /* __DISABLE_VAESF__ */
+
+#endif /* __VAESINTRIN_H_INCLUDED */
diff --git a/third_party/intel/vpclmulqdqintrin.internal.h b/third_party/intel/vpclmulqdqintrin.internal.h
new file mode 100644
index 000000000..f988bd18a
--- /dev/null
+++ b/third_party/intel/vpclmulqdqintrin.internal.h
@@ -0,0 +1,54 @@
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <vpclmulqdqintrin.h> directly; include <immintrin.h> instead."
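/* [editor's illustration, not part of the upstream header] These
   intrinsics carry-less-multiply (i.e. multiply in GF(2)[x]) one 64-bit
   half of each operand; immediate bit 0 selects the qword of the first
   operand and bit 4 the qword of the second. With the 128-bit form from
   wmmintrin.internal.h:

     __m128i lo = _mm_clmulepi64_si128(a, b, 0x00);  // a[63:0]   * b[63:0]
     __m128i hi = _mm_clmulepi64_si128(a, b, 0x11);  // a[127:64] * b[127:64]

   The _mm256/_mm512 forms below apply the same selection within each
   128-bit lane. */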
+#endif
+
+#ifndef _VPCLMULQDQINTRIN_H_INCLUDED
+#define _VPCLMULQDQINTRIN_H_INCLUDED
+
+#if !defined(__VPCLMULQDQ__) || !defined(__AVX512F__)
+#pragma GCC push_options
+#pragma GCC target("vpclmulqdq,avx512f")
+#define __DISABLE_VPCLMULQDQF__
+#endif /* __VPCLMULQDQF__ */
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm512_clmulepi64_epi128(__m512i __A, __m512i __B, const int __C) {
+  return (__m512i)__builtin_ia32_vpclmulqdq_v8di((__v8di)__A, (__v8di)__B, __C);
+}
+#else
+#define _mm512_clmulepi64_epi128(A, B, C)                        \
+  ((__m512i)__builtin_ia32_vpclmulqdq_v8di((__v8di)(__m512i)(A), \
+                                           (__v8di)(__m512i)(B), (int)(C)))
+#endif
+
+#ifdef __DISABLE_VPCLMULQDQF__
+#undef __DISABLE_VPCLMULQDQF__
+#pragma GCC pop_options
+#endif /* __DISABLE_VPCLMULQDQF__ */
+
+#if !defined(__VPCLMULQDQ__) || !defined(__AVX__)
+#pragma GCC push_options
+#pragma GCC target("vpclmulqdq,avx")
+#define __DISABLE_VPCLMULQDQ__
+#endif /* __VPCLMULQDQ__ */
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _mm256_clmulepi64_epi128(__m256i __A, __m256i __B, const int __C) {
+  return (__m256i)__builtin_ia32_vpclmulqdq_v4di((__v4di)__A, (__v4di)__B, __C);
+}
+#else
+#define _mm256_clmulepi64_epi128(A, B, C)                        \
+  ((__m256i)__builtin_ia32_vpclmulqdq_v4di((__v4di)(__m256i)(A), \
+                                           (__v4di)(__m256i)(B), (int)(C)))
+#endif
+
+#ifdef __DISABLE_VPCLMULQDQ__
+#undef __DISABLE_VPCLMULQDQ__
+#pragma GCC pop_options
+#endif /* __DISABLE_VPCLMULQDQ__ */
+
+#endif /* _VPCLMULQDQINTRIN_H_INCLUDED */
diff --git a/third_party/intel/waitpkgintrin.internal.h b/third_party/intel/waitpkgintrin.internal.h
new file mode 100644
index 000000000..1a659070a
--- /dev/null
+++ b/third_party/intel/waitpkgintrin.internal.h
@@ -0,0 +1,37 @@
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <waitpkgintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _WAITPKG_H_INCLUDED
+#define _WAITPKG_H_INCLUDED
+
+#ifndef __WAITPKG__
+#pragma GCC push_options
+#pragma GCC target("waitpkg")
+#define __DISABLE_WAITPKG__
+#endif /* __WAITPKG__ */
+
+extern __inline void
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _umonitor(void *__A) {
+  __builtin_ia32_umonitor(__A);
+}
+
+extern __inline unsigned char
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _umwait(unsigned int __A, unsigned long long __B) {
+  return __builtin_ia32_umwait(__A, __B);
+}
+
+extern __inline unsigned char
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    _tpause(unsigned int __A, unsigned long long __B) {
+  return __builtin_ia32_tpause(__A, __B);
+}
+
+#ifdef __DISABLE_WAITPKG__
+#undef __DISABLE_WAITPKG__
+#pragma GCC pop_options
+#endif /* __DISABLE_WAITPKG__ */
+
+#endif /* _WAITPKG_H_INCLUDED. */
diff --git a/third_party/intel/wbnoinvdintrin.internal.h b/third_party/intel/wbnoinvdintrin.internal.h
new file mode 100644
index 000000000..9e78f8abf
--- /dev/null
+++ b/third_party/intel/wbnoinvdintrin.internal.h
@@ -0,0 +1,25 @@
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <wbnoinvdintrin.h> directly; include <immintrin.h> instead."
+#endif + +#ifndef _WBNOINVDINTRIN_H_INCLUDED +#define _WBNOINVDINTRIN_H_INCLUDED + +#ifndef __WBNOINVD__ +#pragma GCC push_options +#pragma GCC target("wbnoinvd") +#define __DISABLE_WBNOINVD__ +#endif /* __WBNOINVD__ */ + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _wbnoinvd(void) { + __builtin_ia32_wbnoinvd(); +} + +#ifdef __DISABLE_WBNOINVD__ +#undef __DISABLE_WBNOINVD__ +#pragma GCC pop_options +#endif /* __DISABLE_WBNOINVD__ */ + +#endif /* _WBNOINVDINTRIN_H_INCLUDED */ diff --git a/third_party/intel/wmmintrin.internal.h b/third_party/intel/wmmintrin.internal.h new file mode 100644 index 000000000..a24d67767 --- /dev/null +++ b/third_party/intel/wmmintrin.internal.h @@ -0,0 +1,80 @@ +#ifndef _WMMINTRIN_H_INCLUDED +#define _WMMINTRIN_H_INCLUDED +#include "third_party/intel/emmintrin.internal.h" + +#if !defined(__AES__) || !defined(__SSE2__) +#pragma GCC push_options +#pragma GCC target("aes,sse2") +#define __DISABLE_AES__ +#endif /* __AES__ */ + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_aesdec_si128(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_aesdec128((__v2di)__X, (__v2di)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_aesdeclast_si128(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__X, (__v2di)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_aesenc_si128(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_aesenc128((__v2di)__X, (__v2di)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_aesenclast_si128(__m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_aesenclast128((__v2di)__X, (__v2di)__Y); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_aesimc_si128(__m128i __X) { + return (__m128i)__builtin_ia32_aesimc128((__v2di)__X); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_aeskeygenassist_si128(__m128i __X, const int __C) { + return (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)__X, __C); +} +#else +#define _mm_aeskeygenassist_si128(X, C) \ + ((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(X), (int)(C))) +#endif + +#ifdef __DISABLE_AES__ +#undef __DISABLE_AES__ +#pragma GCC pop_options +#endif /* __DISABLE_AES__ */ + +#if !defined(__PCLMUL__) || !defined(__SSE2__) +#pragma GCC push_options +#pragma GCC target("pclmul,sse2") +#define __DISABLE_PCLMUL__ +#endif /* __PCLMUL__ */ + +#ifdef __OPTIMIZE__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I) { + return (__m128i)__builtin_ia32_pclmulqdq128((__v2di)__X, (__v2di)__Y, __I); +} +#else +#define _mm_clmulepi64_si128(X, Y, I) \ + ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (int)(I))) +#endif + +#ifdef __DISABLE_PCLMUL__ +#undef __DISABLE_PCLMUL__ +#pragma GCC pop_options +#endif /* __DISABLE_PCLMUL__ */ + +#endif /* _WMMINTRIN_H_INCLUDED */ diff --git a/third_party/intel/x86intrin.internal.h b/third_party/intel/x86intrin.internal.h new file mode 100644 index 000000000..fa7071511 --- /dev/null +++ b/third_party/intel/x86intrin.internal.h @@ -0,0 +1,19 @@ +#ifndef _X86INTRIN_H_INCLUDED 
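/* [editor's note] x86intrin is the umbrella header: it pulls in the
   Intel immintrin family plus the AMD-side extensions (3DNow!, FMA4,
   XOP, LWP, TBM) listed below. */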
+#define _X86INTRIN_H_INCLUDED +#include "third_party/intel/ia32intrin.internal.h" + +#ifndef __iamcu__ +/* clang-format off */ +#include "third_party/intel/immintrin.internal.h" +#include "third_party/intel/mm3dnow.internal.h" +#include "third_party/intel/fma4intrin.internal.h" +#include "third_party/intel/xopintrin.internal.h" +#include "third_party/intel/lwpintrin.internal.h" +#include "third_party/intel/tbmintrin.internal.h" +#include "third_party/intel/popcntintrin.internal.h" +#include "third_party/intel/mwaitxintrin.internal.h" +#include "third_party/intel/clzerointrin.internal.h" +/* clang-format on */ +#endif /* __iamcu__ */ + +#endif /* _X86INTRIN_H_INCLUDED */ diff --git a/third_party/intel/xmmintrin.internal.h b/third_party/intel/xmmintrin.internal.h new file mode 100644 index 000000000..e07a38b32 --- /dev/null +++ b/third_party/intel/xmmintrin.internal.h @@ -0,0 +1,1090 @@ +#ifndef _XMMINTRIN_H_INCLUDED +#define _XMMINTRIN_H_INCLUDED +#include "third_party/intel/mm_malloc.internal.h" +#include "third_party/intel/mmintrin.internal.h" + +enum _mm_hint { + _MM_HINT_ET0 = 7, + _MM_HINT_ET1 = 6, + _MM_HINT_T0 = 3, + _MM_HINT_T1 = 2, + _MM_HINT_T2 = 1, + _MM_HINT_NTA = 0 +}; + +#ifdef __OPTIMIZE__ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_prefetch(const void *__P, enum _mm_hint __I) { + __builtin_prefetch(__P, (__I & 0x4) >> 2, __I & 0x3); +} +#else +#define _mm_prefetch(P, I) __builtin_prefetch((P), ((I & 0x4) >> 2), (I & 0x3)) +#endif + +#ifndef __SSE__ +#pragma GCC push_options +#pragma GCC target("sse") +#define __DISABLE_SSE__ +#endif /* __SSE__ */ + +typedef float __m128 __attribute__((__vector_size__(16), __may_alias__)); + +typedef float __m128_u + __attribute__((__vector_size__(16), __may_alias__, __aligned__(1))); + +typedef float __v4sf __attribute__((__vector_size__(16))); + +#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) + +#define _MM_EXCEPT_MASK 0x003f +#define _MM_EXCEPT_INVALID 0x0001 +#define _MM_EXCEPT_DENORM 0x0002 +#define _MM_EXCEPT_DIV_ZERO 0x0004 +#define _MM_EXCEPT_OVERFLOW 0x0008 +#define _MM_EXCEPT_UNDERFLOW 0x0010 +#define _MM_EXCEPT_INEXACT 0x0020 + +#define _MM_MASK_MASK 0x1f80 +#define _MM_MASK_INVALID 0x0080 +#define _MM_MASK_DENORM 0x0100 +#define _MM_MASK_DIV_ZERO 0x0200 +#define _MM_MASK_OVERFLOW 0x0400 +#define _MM_MASK_UNDERFLOW 0x0800 +#define _MM_MASK_INEXACT 0x1000 + +#define _MM_ROUND_MASK 0x6000 +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 + +#define _MM_FLUSH_ZERO_MASK 0x8000 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_FLUSH_ZERO_OFF 0x0000 + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_undefined_ps(void) { + __m128 __Y = __Y; + return __Y; +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setzero_ps(void) { + return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f}; +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_addss((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_subss((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm_mul_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_mulss((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_div_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_divss((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sqrt_ss(__m128 __A) { + return (__m128)__builtin_ia32_sqrtss((__v4sf)__A); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rcp_ss(__m128 __A) { + return (__m128)__builtin_ia32_rcpss((__v4sf)__A); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rsqrt_ss(__m128 __A) { + return (__m128)__builtin_ia32_rsqrtss((__v4sf)__A); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_minss((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_maxss((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_add_ps(__m128 __A, __m128 __B) { + return (__m128)((__v4sf)__A + (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sub_ps(__m128 __A, __m128 __B) { + return (__m128)((__v4sf)__A - (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mul_ps(__m128 __A, __m128 __B) { + return (__m128)((__v4sf)__A * (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_div_ps(__m128 __A, __m128 __B) { + return (__m128)((__v4sf)__A / (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sqrt_ps(__m128 __A) { + return (__m128)__builtin_ia32_sqrtps((__v4sf)__A); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rcp_ps(__m128 __A) { + return (__m128)__builtin_ia32_rcpps((__v4sf)__A); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rsqrt_ps(__m128 __A) { + return (__m128)__builtin_ia32_rsqrtps((__v4sf)__A); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_minps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_maxps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_and_ps(__m128 __A, __m128 __B) { + return __builtin_ia32_andps(__A, __B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_andnot_ps(__m128 __A, __m128 __B) { + return __builtin_ia32_andnps(__A, __B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_or_ps(__m128 __A, __m128 __B) { + return __builtin_ia32_orps(__A, __B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) + _mm_xor_ps(__m128 __A, __m128 __B) { + return __builtin_ia32_xorps(__A, __B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpeqss((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpltss((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmple_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpless((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_movss( + (__v4sf)__A, (__v4sf)__builtin_ia32_cmpltss((__v4sf)__B, (__v4sf)__A)); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpge_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_movss( + (__v4sf)__A, (__v4sf)__builtin_ia32_cmpless((__v4sf)__B, (__v4sf)__A)); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpneq_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpneqss((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnlt_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpnltss((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnle_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpnless((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpngt_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_movss( + (__v4sf)__A, (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__B, (__v4sf)__A)); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnge_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_movss( + (__v4sf)__A, (__v4sf)__builtin_ia32_cmpnless((__v4sf)__B, (__v4sf)__A)); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpord_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpordss((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpunord_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpunordss((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpeq_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpeqps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmplt_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpltps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmple_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpleps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpgt_ps(__m128 __A, __m128 __B) { + return 
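      /* [editor's note] each lane of the result is all-ones when the
         predicate holds and all-zeros otherwise; the plain forms
         (eq/lt/le/gt/ge) are ordered comparisons that yield false on a
         NaN lane, while the negated forms (neq/nlt/nle/ngt/nge) are
         unordered and yield true on NaN. */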
(__m128)__builtin_ia32_cmpgtps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpge_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpgeps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpneq_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpneqps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnlt_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpnltps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnle_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpnleps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpngt_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpngtps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpnge_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpngeps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpord_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpordps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmpunord_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_cmpunordps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comieq_ss(__m128 __A, __m128 __B) { + return __builtin_ia32_comieq((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comilt_ss(__m128 __A, __m128 __B) { + return __builtin_ia32_comilt((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comile_ss(__m128 __A, __m128 __B) { + return __builtin_ia32_comile((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comigt_ss(__m128 __A, __m128 __B) { + return __builtin_ia32_comigt((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comige_ss(__m128 __A, __m128 __B) { + return __builtin_ia32_comige((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comineq_ss(__m128 __A, __m128 __B) { + return __builtin_ia32_comineq((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomieq_ss(__m128 __A, __m128 __B) { + return __builtin_ia32_ucomieq((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomilt_ss(__m128 __A, __m128 __B) { + return __builtin_ia32_ucomilt((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomile_ss(__m128 __A, __m128 __B) { + return __builtin_ia32_ucomile((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm_ucomigt_ss(__m128 __A, __m128 __B) { + return __builtin_ia32_ucomigt((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomige_ss(__m128 __A, __m128 __B) { + return __builtin_ia32_ucomige((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_ucomineq_ss(__m128 __A, __m128 __B) { + return __builtin_ia32_ucomineq((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_si32(__m128 __A) { + return __builtin_ia32_cvtss2si((__v4sf)__A); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_ss2si(__m128 __A) { + return _mm_cvtss_si32(__A); +} + +#ifdef __x86_64__ + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_si64(__m128 __A) { + return __builtin_ia32_cvtss2si64((__v4sf)__A); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_si64x(__m128 __A) { + return __builtin_ia32_cvtss2si64((__v4sf)__A); +} +#endif + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtps_pi32(__m128 __A) { + return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__A); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_ps2pi(__m128 __A) { + return _mm_cvtps_pi32(__A); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttss_si32(__m128 __A) { + return __builtin_ia32_cvttss2si((__v4sf)__A); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtt_ss2si(__m128 __A) { + return _mm_cvttss_si32(__A); +} + +#ifdef __x86_64__ + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttss_si64(__m128 __A) { + return __builtin_ia32_cvttss2si64((__v4sf)__A); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttss_si64x(__m128 __A) { + return __builtin_ia32_cvttss2si64((__v4sf)__A); +} +#endif + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvttps_pi32(__m128 __A) { + return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__A); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtt_ps2pi(__m128 __A) { + return _mm_cvttps_pi32(__A); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi32_ss(__m128 __A, int __B) { + return (__m128)__builtin_ia32_cvtsi2ss((__v4sf)__A, __B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_si2ss(__m128 __A, int __B) { + return _mm_cvtsi32_ss(__A, __B); +} + +#ifdef __x86_64__ + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64_ss(__m128 __A, long long __B) { + return (__m128)__builtin_ia32_cvtsi642ss((__v4sf)__A, __B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtsi64x_ss(__m128 __A, long long __B) { + return (__m128)__builtin_ia32_cvtsi642ss((__v4sf)__A, __B); +} +#endif + +extern __inline __m128 + __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) + _mm_cvtpi32_ps(__m128 __A, __m64 __B) { + return (__m128)__builtin_ia32_cvtpi2ps((__v4sf)__A, (__v2si)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvt_pi2ps(__m128 __A, __m64 __B) { + return _mm_cvtpi32_ps(__A, __B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpi16_ps(__m64 __A) { + __v4hi __sign; + __v2si __hisi, __losi; + __v4sf __zero, __ra, __rb; + __sign = __builtin_ia32_pcmpgtw((__v4hi)0LL, (__v4hi)__A); + __losi = (__v2si)__builtin_ia32_punpcklwd((__v4hi)__A, __sign); + __hisi = (__v2si)__builtin_ia32_punpckhwd((__v4hi)__A, __sign); + __zero = (__v4sf)_mm_setzero_ps(); + __ra = __builtin_ia32_cvtpi2ps(__zero, __losi); + __rb = __builtin_ia32_cvtpi2ps(__ra, __hisi); + return (__m128)__builtin_ia32_movlhps(__ra, __rb); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpu16_ps(__m64 __A) { + __v2si __hisi, __losi; + __v4sf __zero, __ra, __rb; + __losi = (__v2si)__builtin_ia32_punpcklwd((__v4hi)__A, (__v4hi)0LL); + __hisi = (__v2si)__builtin_ia32_punpckhwd((__v4hi)__A, (__v4hi)0LL); + __zero = (__v4sf)_mm_setzero_ps(); + __ra = __builtin_ia32_cvtpi2ps(__zero, __losi); + __rb = __builtin_ia32_cvtpi2ps(__ra, __hisi); + return (__m128)__builtin_ia32_movlhps(__ra, __rb); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpi8_ps(__m64 __A) { + __v8qi __sign; + + __sign = __builtin_ia32_pcmpgtb((__v8qi)0LL, (__v8qi)__A); + + __A = (__m64)__builtin_ia32_punpcklbw((__v8qi)__A, __sign); + + return _mm_cvtpi16_ps(__A); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpu8_ps(__m64 __A) { + __A = (__m64)__builtin_ia32_punpcklbw((__v8qi)__A, (__v8qi)0LL); + return _mm_cvtpu16_ps(__A); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) { + __v4sf __zero = (__v4sf)_mm_setzero_ps(); + __v4sf __sfa = __builtin_ia32_cvtpi2ps(__zero, (__v2si)__A); + __v4sf __sfb = __builtin_ia32_cvtpi2ps(__sfa, (__v2si)__B); + return (__m128)__builtin_ia32_movlhps(__sfa, __sfb); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtps_pi16(__m128 __A) { + __v4sf __hisf = (__v4sf)__A; + __v4sf __losf = __builtin_ia32_movhlps(__hisf, __hisf); + __v2si __hisi = __builtin_ia32_cvtps2pi(__hisf); + __v2si __losi = __builtin_ia32_cvtps2pi(__losf); + return (__m64)__builtin_ia32_packssdw(__hisi, __losi); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtps_pi8(__m128 __A) { + __v4hi __tmp = (__v4hi)_mm_cvtps_pi16(__A); + return (__m64)__builtin_ia32_packsswb(__tmp, (__v4hi)0LL); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) { + return (__m128)__builtin_ia32_shufps((__v4sf)__A, (__v4sf)__B, __mask); +} +#else +#define _mm_shuffle_ps(A, B, MASK) \ + ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ + (int)(MASK))) +#endif + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpackhi_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_unpckhps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + 
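/* [editor's illustration, not part of the upstream header] _MM_SHUFFLE
   packs four 2-bit lane selectors, highest result lane first; for
   _mm_shuffle_ps the two low result lanes come from the first operand
   and the two high lanes from the second. A minimal sketch:

     __m128 v = _mm_setr_ps(1, 2, 3, 4);
     __m128 r = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
     // r is {1, 1, 1, 1}; _mm_store1_ps below uses this same broadcast. */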
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_unpacklo_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_unpcklps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadh_pi(__m128 __A, __m64 const *__P) { + return (__m128)__builtin_ia32_loadhps((__v4sf)__A, (const __v2sf *)__P); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storeh_pi(__m64 *__P, __m128 __A) { + __builtin_ia32_storehps((__v2sf *)__P, (__v4sf)__A); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movehl_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_movhlps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movelh_ps(__m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_movlhps((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadl_pi(__m128 __A, __m64 const *__P) { + return (__m128)__builtin_ia32_loadlps((__v4sf)__A, (const __v2sf *)__P); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storel_pi(__m64 *__P, __m128 __A) { + __builtin_ia32_storelps((__v2sf *)__P, (__v4sf)__A); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movemask_ps(__m128 __A) { + return __builtin_ia32_movmskps((__v4sf)__A); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_getcsr(void) { + return __builtin_ia32_stmxcsr(); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_GET_EXCEPTION_STATE(void) { + return _mm_getcsr() & _MM_EXCEPT_MASK; +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_GET_EXCEPTION_MASK(void) { + return _mm_getcsr() & _MM_MASK_MASK; +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_GET_ROUNDING_MODE(void) { + return _mm_getcsr() & _MM_ROUND_MASK; +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_GET_FLUSH_ZERO_MODE(void) { + return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setcsr(unsigned int __I) { + __builtin_ia32_ldmxcsr(__I); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_SET_EXCEPTION_STATE(unsigned int __mask) { + _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_SET_EXCEPTION_MASK(unsigned int __mask) { + _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_SET_ROUNDING_MODE(unsigned int __mode) { + _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _MM_SET_FLUSH_ZERO_MODE(unsigned int __mode) { + _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_ss(float 
__F) { + return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f}; +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set1_ps(float __F) { + return __extension__(__m128)(__v4sf){__F, __F, __F, __F}; +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_set_ps1(float __F) { + return _mm_set1_ps(__F); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_ss(float const *__P) { + return _mm_set_ss(*__P); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load1_ps(float const *__P) { + return _mm_set1_ps(*__P); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_ps1(float const *__P) { + return _mm_load1_ps(__P); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_load_ps(float const *__P) { + return *(__m128 *)__P; +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadu_ps(float const *__P) { + return *(__m128_u *)__P; +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_loadr_ps(float const *__P) { + __v4sf __tmp = *(__v4sf *)__P; + return (__m128)__builtin_ia32_shufps(__tmp, __tmp, _MM_SHUFFLE(0, 1, 2, 3)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) +_mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) { + return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z}; +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_setr_ps(float __Z, float __Y, float __X, float __W) { + return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W}; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_ss(float *__P, __m128 __A) { + *__P = ((__v4sf)__A)[0]; +} + +extern __inline float + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cvtss_f32(__m128 __A) { + return ((__v4sf)__A)[0]; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_ps(float *__P, __m128 __A) { + *(__m128 *)__P = __A; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storeu_ps(float *__P, __m128 __A) { + *(__m128_u *)__P = __A; +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store1_ps(float *__P, __m128 __A) { + __v4sf __va = (__v4sf)__A; + __v4sf __tmp = __builtin_ia32_shufps(__va, __va, _MM_SHUFFLE(0, 0, 0, 0)); + _mm_storeu_ps(__P, __tmp); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_store_ps1(float *__P, __m128 __A) { + _mm_store1_ps(__P, __A); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_storer_ps(float *__P, __m128 __A) { + __v4sf __va = (__v4sf)__A; + __v4sf __tmp = __builtin_ia32_shufps(__va, __va, _MM_SHUFFLE(0, 1, 2, 3)); + _mm_store_ps(__P, __tmp); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_move_ss(__m128 __A, __m128 __B) { + return (__m128)__builtin_shuffle( + (__v4sf)__A, (__v4sf)__B, + __extension__(__attribute__((__vector_size__(16))) int){4, 1, 2, 3}); +} + +#ifdef 
__OPTIMIZE__ +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_pi16(__m64 const __A, int const __N) { + return __builtin_ia32_vec_ext_v4hi((__v4hi)__A, __N); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pextrw(__m64 const __A, int const __N) { + return _mm_extract_pi16(__A, __N); +} +#else +#define _mm_extract_pi16(A, N) \ + ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(__m64)(A), (int)(N))) + +#define _m_pextrw(A, N) _mm_extract_pi16(A, N) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_insert_pi16(__m64 const __A, int const __D, int const __N) { + return (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)__A, __D, __N); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pinsrw(__m64 const __A, int const __D, int const __N) { + return _mm_insert_pi16(__A, __D, __N); +} +#else +#define _mm_insert_pi16(A, D, N) \ + ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(__m64)(A), (int)(D), (int)(N))) + +#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N) +#endif + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_pi16(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pmaxsw((__v4hi)__A, (__v4hi)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmaxsw(__m64 __A, __m64 __B) { + return _mm_max_pi16(__A, __B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_max_pu8(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pmaxub((__v8qi)__A, (__v8qi)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmaxub(__m64 __A, __m64 __B) { + return _mm_max_pu8(__A, __B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_pi16(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pminsw((__v4hi)__A, (__v4hi)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pminsw(__m64 __A, __m64 __B) { + return _mm_min_pi16(__A, __B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_min_pu8(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pminub((__v8qi)__A, (__v8qi)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pminub(__m64 __A, __m64 __B) { + return _mm_min_pu8(__A, __B); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_movemask_pi8(__m64 __A) { + return __builtin_ia32_pmovmskb((__v8qi)__A); +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmovmskb(__m64 __A) { + return _mm_movemask_pi8(__A); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mulhi_pu16(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pmulhuw((__v4hi)__A, (__v4hi)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pmulhuw(__m64 __A, __m64 __B) { + return _mm_mulhi_pu16(__A, __B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shuffle_pi16(__m64 __A, int const __N) { + return 
(__m64)__builtin_ia32_pshufw((__v4hi)__A, __N); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pshufw(__m64 __A, int const __N) { + return _mm_shuffle_pi16(__A, __N); +} +#else +#define _mm_shuffle_pi16(A, N) \ + ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(A), (int)(N))) + +#define _m_pshufw(A, N) _mm_shuffle_pi16(A, N) +#endif + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) { + __builtin_ia32_maskmovq((__v8qi)__A, (__v8qi)__N, __P); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_maskmovq(__m64 __A, __m64 __N, char *__P) { + _mm_maskmove_si64(__A, __N, __P); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_avg_pu8(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pavgb((__v8qi)__A, (__v8qi)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pavgb(__m64 __A, __m64 __B) { + return _mm_avg_pu8(__A, __B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_avg_pu16(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_pavgw((__v4hi)__A, (__v4hi)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_pavgw(__m64 __A, __m64 __B) { + return _mm_avg_pu16(__A, __B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sad_pu8(__m64 __A, __m64 __B) { + return (__m64)__builtin_ia32_psadbw((__v8qi)__A, (__v8qi)__B); +} + +extern __inline __m64 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_psadbw(__m64 __A, __m64 __B) { + return _mm_sad_pu8(__A, __B); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_pi(__m64 *__P, __m64 __A) { + __builtin_ia32_movntq((unsigned long long *)__P, (unsigned long long)__A); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_stream_ps(float *__P, __m128 __A) { + __builtin_ia32_movntps(__P, (__v4sf)__A); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sfence(void) { + __builtin_ia32_sfence(); +} + +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ + __v4sf __t0 = __builtin_ia32_unpcklps(__r0, __r1); \ + __v4sf __t1 = __builtin_ia32_unpcklps(__r2, __r3); \ + __v4sf __t2 = __builtin_ia32_unpckhps(__r0, __r1); \ + __v4sf __t3 = __builtin_ia32_unpckhps(__r2, __r3); \ + (row0) = __builtin_ia32_movlhps(__t0, __t1); \ + (row1) = __builtin_ia32_movhlps(__t1, __t0); \ + (row2) = __builtin_ia32_movlhps(__t2, __t3); \ + (row3) = __builtin_ia32_movhlps(__t3, __t2); \ + } while (0) + +#include "third_party/intel/emmintrin.internal.h" + +#ifdef __DISABLE_SSE__ +#undef __DISABLE_SSE__ +#pragma GCC pop_options +#endif /* __DISABLE_SSE__ */ + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_pause(void) { + __builtin_ia32_pause(); +} + +#endif /* _XMMINTRIN_H_INCLUDED */ diff --git a/third_party/intel/xopintrin.internal.h b/third_party/intel/xopintrin.internal.h new file mode 100644 index 000000000..992f5a1ea --- /dev/null +++ b/third_party/intel/xopintrin.internal.h @@ -0,0 +1,792 @@ +#ifndef 
_X86INTRIN_H_INCLUDED +#error "Never use <xopintrin.h> directly; include <x86intrin.h> instead." +#endif + +#ifndef _XOPMMINTRIN_H_INCLUDED +#define _XOPMMINTRIN_H_INCLUDED + +#include "third_party/intel/fma4intrin.internal.h" + +#ifndef __XOP__ +#pragma GCC push_options +#pragma GCC target("xop") +#define __DISABLE_XOP__ +#endif /* __XOP__ */ + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, + (__v8hi)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, + (__v8hi)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, + (__v4si)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, + (__v4si)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, + (__v4si)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, + (__v4si)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, + (__v2di)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, + (__v2di)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, + (__v2di)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, + (__v2di)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, + (__v4si)__C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, + (__v4si)__C); +} + +/* Packed Integer Horizontal Add and Subtract */ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_haddw_epi8(__m128i __A) { + return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__,
__always_inline__, __artificial__)) + _mm_haddd_epi8(__m128i __A) { + return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_haddq_epi8(__m128i __A) { + return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_haddd_epi16(__m128i __A) { + return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_haddq_epi16(__m128i __A) { + return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_haddq_epi32(__m128i __A) { + return (__m128i)__builtin_ia32_vphadddq((__v4si)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_haddw_epu8(__m128i __A) { + return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_haddd_epu8(__m128i __A) { + return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_haddq_epu8(__m128i __A) { + return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_haddd_epu16(__m128i __A) { + return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_haddq_epu16(__m128i __A) { + return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_haddq_epu32(__m128i __A) { + return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsubw_epi8(__m128i __A) { + return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsubd_epi16(__m128i __A) { + return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_hsubq_epi32(__m128i __A) { + return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A); +} + +/* Vector conditional move and permute */ + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpcmov(__A, __B, __C); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C) { + return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, + (__v16qi)__C); +} + +/* Packed Integer Rotates and Shifts + Rotates - Non-Immediate form */ + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rot_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rot_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, 
(__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rot_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_rot_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_roti_epi8(__m128i __A, const int __B) { + return (__m128i)__builtin_ia32_vprotbi((__v16qi)__A, __B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_roti_epi16(__m128i __A, const int __B) { + return (__m128i)__builtin_ia32_vprotwi((__v8hi)__A, __B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_roti_epi32(__m128i __A, const int __B) { + return (__m128i)__builtin_ia32_vprotdi((__v4si)__A, __B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_roti_epi64(__m128i __A, const int __B) { + return (__m128i)__builtin_ia32_vprotqi((__v2di)__A, __B); +} +#else +#define _mm_roti_epi8(A, N) \ + ((__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (int)(N))) +#define _mm_roti_epi16(A, N) \ + ((__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (int)(N))) +#define _mm_roti_epi32(A, N) \ + ((__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (int)(N))) +#define _mm_roti_epi64(A, N) \ + ((__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (int)(N))) +#endif + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shl_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shl_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shl_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_shl_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sha_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sha_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sha_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_sha_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comlt_epu8(__m128i __A, __m128i __B) { + return 
(__m128i)__builtin_ia32_vpcomltub((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comle_epu8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomleub((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comgt_epu8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomgtub((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comge_epu8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomgeub((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comeq_epu8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomequb((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comneq_epu8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomnequb((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comfalse_epu8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomfalseub((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comtrue_epu8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomtrueub((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comlt_epu16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomltuw((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comle_epu16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomleuw((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comgt_epu16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomgtuw((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comge_epu16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomgeuw((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comeq_epu16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomequw((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comneq_epu16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomnequw((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comfalse_epu16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomfalseuw((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comtrue_epu16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomtrueuw((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comlt_epu32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomltud((__v4si)__A, (__v4si)__B); +} + +extern 
__inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comle_epu32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomleud((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comgt_epu32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomgtud((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comge_epu32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomgeud((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comeq_epu32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomequd((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comneq_epu32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomnequd((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comfalse_epu32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomfalseud((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comtrue_epu32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomtrueud((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comlt_epu64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomltuq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comle_epu64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomleuq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comgt_epu64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomgtuq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comge_epu64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomgeuq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comeq_epu64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomequq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comneq_epu64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomnequq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comfalse_epu64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomfalseuq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comtrue_epu64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomtrueuq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comlt_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomltb((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + 
_mm_comle_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomleb((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comgt_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomgtb((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comge_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomgeb((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comeq_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomeqb((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comneq_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomneqb((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comfalse_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomfalseb((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comtrue_epi8(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomtrueb((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comlt_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomltw((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comle_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomlew((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comgt_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomgtw((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comge_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomgew((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comeq_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomeqw((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comneq_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomneqw((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comfalse_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomfalsew((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comtrue_epi16(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomtruew((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comlt_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomltd((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comle_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomled((__v4si)__A, 
(__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comgt_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomgtd((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comge_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomged((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comeq_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomeqd((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comneq_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomneqd((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comfalse_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomfalsed((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comtrue_epi32(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomtrued((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comlt_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomltq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comle_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomleq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comgt_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomgtq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comge_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomgeq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comeq_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomeqq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comneq_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomneqq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comfalse_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomfalseq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_comtrue_epi64(__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_vpcomtrueq((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_frcz_ps(__m128 __A) { + return (__m128)__builtin_ia32_vfrczps((__v4sf)__A); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_frcz_pd(__m128d __A) { + return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_frcz_ss(__m128 __A, __m128 __B) { + return 
(__m128)__builtin_ia32_movss( + (__v4sf)__A, (__v4sf)__builtin_ia32_vfrczss((__v4sf)__B)); +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_frcz_sd(__m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_movsd( + (__v2df)__A, (__v2df)__builtin_ia32_vfrczsd((__v2df)__B)); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_frcz_ps(__m256 __A) { + return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_frcz_pd(__m256d __A) { + return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_permute2_pd(__m128d __X, __m128d __Y, __m128i __C, const int __I) { + return (__m128d)__builtin_ia32_vpermil2pd((__v2df)__X, (__v2df)__Y, + (__v2di)__C, __I); +} + +extern __inline __m256d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permute2_pd(__m256d __X, __m256d __Y, __m256i __C, const int __I) { + return (__m256d)__builtin_ia32_vpermil2pd256((__v4df)__X, (__v4df)__Y, + (__v4di)__C, __I); +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_permute2_ps(__m128 __X, __m128 __Y, __m128i __C, const int __I) { + return (__m128)__builtin_ia32_vpermil2ps((__v4sf)__X, (__v4sf)__Y, + (__v4si)__C, __I); +} + +extern __inline __m256 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm256_permute2_ps(__m256 __X, __m256 __Y, __m256i __C, const int __I) { + return (__m256)__builtin_ia32_vpermil2ps256((__v8sf)__X, (__v8sf)__Y, + (__v8si)__C, __I); +} +#else +#define _mm_permute2_pd(X, Y, C, I) \ + ((__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__v2di)(__m128d)(C), (int)(I))) + +#define _mm256_permute2_pd(X, Y, C, I) \ + ((__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), \ + (__v4di)(__m256d)(C), (int)(I))) + +#define _mm_permute2_ps(X, Y, C, I) \ + ((__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ + (__v4si)(__m128)(C), (int)(I))) + +#define _mm256_permute2_ps(X, Y, C, I) \ + ((__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), \ + (__v8si)(__m256)(C), (int)(I))) +#endif /* __OPTIMIZE__ */ + +#ifdef __DISABLE_XOP__ +#undef __DISABLE_XOP__ +#pragma GCC pop_options +#endif /* __DISABLE_XOP__ */ + +#endif /* _XOPMMINTRIN_H_INCLUDED */ diff --git a/third_party/intel/xsavecintrin.internal.h b/third_party/intel/xsavecintrin.internal.h new file mode 100644 index 000000000..35ae37ac5 --- /dev/null +++ b/third_party/intel/xsavecintrin.internal.h @@ -0,0 +1,33 @@ +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <xsavecintrin.h> directly; include <immintrin.h> instead."
+#endif + +#ifndef _XSAVECINTRIN_H_INCLUDED +#define _XSAVECINTRIN_H_INCLUDED + +#ifndef __XSAVEC__ +#pragma GCC push_options +#pragma GCC target("xsavec") +#define __DISABLE_XSAVEC__ +#endif /* __XSAVEC__ */ + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _xsavec(void *__P, long long __M) { + __builtin_ia32_xsavec(__P, __M); +} + +#ifdef __x86_64__ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _xsavec64(void *__P, long long __M) { + __builtin_ia32_xsavec64(__P, __M); +} +#endif + +#ifdef __DISABLE_XSAVEC__ +#undef __DISABLE_XSAVEC__ +#pragma GCC pop_options +#endif /* __DISABLE_XSAVEC__ */ + +#endif /* _XSAVECINTRIN_H_INCLUDED */ diff --git a/third_party/intel/xsaveintrin.internal.h b/third_party/intel/xsaveintrin.internal.h new file mode 100644 index 000000000..4b7c4b5f9 --- /dev/null +++ b/third_party/intel/xsaveintrin.internal.h @@ -0,0 +1,57 @@ +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <xsaveintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _XSAVEINTRIN_H_INCLUDED +#define _XSAVEINTRIN_H_INCLUDED + +#ifndef __XSAVE__ +#pragma GCC push_options +#pragma GCC target("xsave") +#define __DISABLE_XSAVE__ +#endif /* __XSAVE__ */ + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _xsave(void *__P, long long __M) { + __builtin_ia32_xsave(__P, __M); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _xrstor(void *__P, long long __M) { + __builtin_ia32_xrstor(__P, __M); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _xsetbv(unsigned int __A, long long __V) { + __builtin_ia32_xsetbv(__A, __V); +} + +extern __inline long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _xgetbv(unsigned int __A) { + return __builtin_ia32_xgetbv(__A); +} + +#ifdef __x86_64__ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _xsave64(void *__P, long long __M) { + __builtin_ia32_xsave64(__P, __M); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _xrstor64(void *__P, long long __M) { + __builtin_ia32_xrstor64(__P, __M); +} +#endif + +#ifdef __DISABLE_XSAVE__ +#undef __DISABLE_XSAVE__ +#pragma GCC pop_options +#endif /* __DISABLE_XSAVE__ */ + +#endif /* _XSAVEINTRIN_H_INCLUDED */ diff --git a/third_party/intel/xsaveoptintrin.internal.h b/third_party/intel/xsaveoptintrin.internal.h new file mode 100644 index 000000000..173779f92 --- /dev/null +++ b/third_party/intel/xsaveoptintrin.internal.h @@ -0,0 +1,33 @@ +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <xsaveoptintrin.h> directly; include <immintrin.h> instead."
+#endif + +#ifndef _XSAVEOPTINTRIN_H_INCLUDED +#define _XSAVEOPTINTRIN_H_INCLUDED + +#ifndef __XSAVEOPT__ +#pragma GCC push_options +#pragma GCC target("xsaveopt") +#define __DISABLE_XSAVEOPT__ +#endif /* __XSAVEOPT__ */ + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _xsaveopt(void *__P, long long __M) { + __builtin_ia32_xsaveopt(__P, __M); +} + +#ifdef __x86_64__ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _xsaveopt64(void *__P, long long __M) { + __builtin_ia32_xsaveopt64(__P, __M); +} +#endif + +#ifdef __DISABLE_XSAVEOPT__ +#undef __DISABLE_XSAVEOPT__ +#pragma GCC pop_options +#endif /* __DISABLE_XSAVEOPT__ */ + +#endif /* _XSAVEOPTINTRIN_H_INCLUDED */ diff --git a/third_party/intel/xsavesintrin.internal.h b/third_party/intel/xsavesintrin.internal.h new file mode 100644 index 000000000..765f64778 --- /dev/null +++ b/third_party/intel/xsavesintrin.internal.h @@ -0,0 +1,45 @@ +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use <xsavesintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _XSAVESINTRIN_H_INCLUDED +#define _XSAVESINTRIN_H_INCLUDED + +#ifndef __XSAVES__ +#pragma GCC push_options +#pragma GCC target("xsaves") +#define __DISABLE_XSAVES__ +#endif /* __XSAVES__ */ + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _xsaves(void *__P, long long __M) { + __builtin_ia32_xsaves(__P, __M); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _xrstors(void *__P, long long __M) { + __builtin_ia32_xrstors(__P, __M); +} + +#ifdef __x86_64__ +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _xrstors64(void *__P, long long __M) { + __builtin_ia32_xrstors64(__P, __M); +} + +extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _xsaves64(void *__P, long long __M) { + __builtin_ia32_xsaves64(__P, __M); +} +#endif + +#ifdef __DISABLE_XSAVES__ +#undef __DISABLE_XSAVES__ +#pragma GCC pop_options +#endif /* __DISABLE_XSAVES__ */ + +#endif /* _XSAVESINTRIN_H_INCLUDED */ diff --git a/third_party/intel/xtestintrin.internal.h b/third_party/intel/xtestintrin.internal.h new file mode 100644 index 000000000..f417dcad1 --- /dev/null +++ b/third_party/intel/xtestintrin.internal.h @@ -0,0 +1,25 @@ +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <xtestintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _XTESTINTRIN_H_INCLUDED +#define _XTESTINTRIN_H_INCLUDED + +#ifndef __RTM__ +#pragma GCC push_options +#pragma GCC target("rtm") +#define __DISABLE_RTM__ +#endif /* __RTM__ */ + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _xtest(void) { + return __builtin_ia32_xtest(); +} + +#ifdef __DISABLE_RTM__ +#undef __DISABLE_RTM__ +#pragma GCC pop_options +#endif /* __DISABLE_RTM__ */ + +#endif /* _XTESTINTRIN_H_INCLUDED */ diff --git a/third_party/quickjs/libbf.c b/third_party/quickjs/libbf.c index 5abb3b147..c370edb17 100644 --- a/third_party/quickjs/libbf.c +++ b/third_party/quickjs/libbf.c @@ -21,16 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE.
*/ +#include "third_party/quickjs/libbf.h" #include "libc/assert.h" -#include "libc/intrin/avxintrin.internal.h" #include "libc/intrin/likely.h" #include "libc/inttypes.h" #include "libc/runtime/runtime.h" #include "libc/stdio/stdio.h" #include "libc/str/str.h" +#include "third_party/intel/immintrin.internal.h" #include "third_party/quickjs/cutils.h" #include "third_party/quickjs/diglet.h" -#include "third_party/quickjs/libbf.h" asm(".ident\t\"\\n\\n\ QuickJS (MIT License)\\n\ diff --git a/third_party/zlib/adler32simd.c b/third_party/zlib/adler32simd.c index 999a8fdde..ee5568bc8 100644 --- a/third_party/zlib/adler32simd.c +++ b/third_party/zlib/adler32simd.c @@ -5,8 +5,8 @@ │ Use of this source code is governed by the BSD-style licenses that can │ │ be found in the third_party/zlib/LICENSE file. │ ╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/emmintrin.internal.h" -#include "libc/intrin/tmmintrin.internal.h" +#include "third_party/intel/emmintrin.internal.h" +#include "third_party/intel/tmmintrin.internal.h" #include "third_party/zlib/internal.h" asm(".ident\t\"\\n\\n\ diff --git a/tool/build/emubin/metalsha256.c b/tool/build/emubin/metalsha256.c index e27dc861f..2d68469c5 100644 --- a/tool/build/emubin/metalsha256.c +++ b/tool/build/emubin/metalsha256.c @@ -16,9 +16,8 @@ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/xmmintrin.internal.h" -#include "libc/intrin/repstosb.h" #include "tool/build/emubin/metalsha256.h" +#include "libc/intrin/repstosb.h" #define ROTR(a, b) (((a) >> (b)) | ((a) << (32 - (b)))) #define CH(x, y, z) (((x) & (y)) ^ (~(x) & (z))) diff --git a/tool/viz/lib/convolve.h b/tool/viz/lib/convolve.h index 832276b3d..2967a8281 100644 --- a/tool/viz/lib/convolve.h +++ b/tool/viz/lib/convolve.h @@ -1,6 +1,5 @@ #ifndef COSMOPOLITAN_TOOL_VIZ_LIB_CONVOLVE_H_ #define COSMOPOLITAN_TOOL_VIZ_LIB_CONVOLVE_H_ -#include "libc/intrin/xmmintrin.internal.h" #include "libc/str/str.h" #include "tool/viz/lib/graphic.h" #if !(__ASSEMBLER__ + __LINKER__ + 0) diff --git a/tool/viz/lib/gaussian.c b/tool/viz/lib/gaussian.c index bc6f19356..d10f298d9 100644 --- a/tool/viz/lib/gaussian.c +++ b/tool/viz/lib/gaussian.c @@ -16,7 +16,6 @@ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/intrin/xmmintrin.internal.h" #include "libc/macros.internal.h" #include "libc/mem/mem.h" #include "libc/str/str.h" diff --git a/tool/viz/lib/getxtermcodes.c b/tool/viz/lib/getxtermcodes.c index f320875ab..3241c396f 100644 --- a/tool/viz/lib/getxtermcodes.c +++ b/tool/viz/lib/getxtermcodes.c @@ -17,7 +17,6 @@ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "dsp/tty/quant.h" -#include "libc/intrin/xmmintrin.internal.h" #include "libc/macros.internal.h" #include "tool/viz/lib/graphic.h" diff --git a/tool/viz/lib/resizegraphic.c b/tool/viz/lib/resizegraphic.c index d17e53540..9ceedc1bb 100644 --- a/tool/viz/lib/resizegraphic.c +++ b/tool/viz/lib/resizegraphic.c @@ -17,7 +17,6 @@ │ PERFORMANCE OF THIS SOFTWARE. 
│ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/assert.h" -#include "libc/intrin/xmmintrin.internal.h" #include "libc/runtime/buffer.internal.h" #include "tool/viz/lib/graphic.h" diff --git a/tool/viz/lib/ycbcr2rgb3.c b/tool/viz/lib/ycbcr2rgb3.c index a1d9c1cf7..6732fb6d7 100644 --- a/tool/viz/lib/ycbcr2rgb3.c +++ b/tool/viz/lib/ycbcr2rgb3.c @@ -28,7 +28,6 @@ #include "libc/calls/calls.h" #include "libc/calls/struct/sigset.h" #include "libc/intrin/pmulhrsw.h" -#include "libc/intrin/xmmintrin.internal.h" #include "libc/log/check.h" #include "libc/log/log.h" #include "libc/macros.internal.h"
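The hunks above relocate GCC's vendored intrinsic headers without changing their semantics, so a few usage sketches may help readers unfamiliar with these APIs; none of the following code is part of the diff. First, the MXCSR accessors from xmmintrin: _MM_SET_ROUNDING_MODE and _MM_SET_FLUSH_ZERO_MODE are read-modify-write wrappers around _mm_getcsr/_mm_setcsr, so callers normally snapshot the whole register first and restore it afterwards (the _MM_FLUSH_ZERO_* and _MM_ROUND_* constants come from earlier in the same header).

/* Sketch: scoped MXCSR changes via the wrappers shown above. */
#include "third_party/intel/xmmintrin.internal.h"

void with_ftz_and_truncation(void) {
  unsigned int saved = _mm_getcsr();            /* snapshot full MXCSR */
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);   /* denormal outputs -> 0 */
  _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); /* truncate on convert */
  /* ... SSE float kernel goes here ... */
  _mm_setcsr(saved);                            /* undo both changes */
}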
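Likewise for _MM_TRANSPOSE4_PS: the macro takes four row lvalues and leaves the transposed matrix in the same variables, built from the unpcklps/unpckhps/movlhps/movhlps primitives defined just before it. A minimal sketch:

/* Sketch: 4x4 float transpose with the macro from xmmintrin above. */
#include "third_party/intel/xmmintrin.internal.h"

void transpose4x4(float m[16]) {
  __m128 r0 = _mm_loadu_ps(m + 0);   /* row 0 */
  __m128 r1 = _mm_loadu_ps(m + 4);   /* row 1 */
  __m128 r2 = _mm_loadu_ps(m + 8);   /* row 2 */
  __m128 r3 = _mm_loadu_ps(m + 12);  /* row 3 */
  _MM_TRANSPOSE4_PS(r0, r1, r2, r3); /* rows become columns in place */
  _mm_storeu_ps(m + 0, r0);
  _mm_storeu_ps(m + 4, r1);
  _mm_storeu_ps(m + 8, r2);
  _mm_storeu_ps(m + 12, r3);
}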
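The XOP header is an AMD-only extension (Bulldozer era), so callers need both a target("xop") attribute on their own function (the header's pragma covers only its inlines) and a runtime dispatch check. A hypothetical sketch, assuming the tree's x86intrin.internal.h umbrella header is what pulls xopintrin in, with blend_bits as an invented helper name:

/* Sketch: XOP bitwise select, gated at runtime. */
#include "third_party/intel/x86intrin.internal.h"

__attribute__((__target__("xop"))) static __m128i
blend_bits(__m128i a, __m128i b, __m128i mask) {
  /* picks bits from a where mask is 1, from b where mask is 0 */
  return _mm_cmov_si128(a, b, mask);
}

int have_xop(void) {
  return __builtin_cpu_supports("xop"); /* gate calls to blend_bits() */
}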
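Finally, the XSAVE family: _xgetbv(0) reads XCR0, the enabled state-component bitmap, and _xsave/_xrstor save and restore the components selected by the mask into a 64-byte-aligned area. Real code would size the buffer via CPUID leaf 0Dh; the 4096-byte area below is an assumption for the sketch.

/* Sketch: checkpoint and restore x87/SSE/AVX state with XSAVE. */
#include "third_party/intel/immintrin.internal.h"

static char xsave_area[4096] __attribute__((__aligned__(64)));

__attribute__((__target__("xsave"))) void checkpoint_vector_state(void) {
  long long mask = _xgetbv(0) & 0x7; /* x87 | SSE | AVX, as enabled */
  _xsave(xsave_area, mask);          /* save selected components */
  /* ... code that clobbers vector registers ... */
  _xrstor(xsave_area, mask);         /* put them back */
}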