From 80db9de173a4a1eea5ce925606371754a984193d Mon Sep 17 00:00:00 2001
From: Justine Tunney
Date: Mon, 15 May 2023 23:11:47 -0700
Subject: [PATCH] Make the intrinsics more readable

---
 libc/integral/c.inc | 4 +
 third_party/aarch64/arm_fp16.h | 182 +-
 third_party/aarch64/arm_neon.h | 8508 +++++++++--------
 third_party/ggml/ggml.c | 48 +-
 third_party/intel/adxintrin.internal.h | 39 +-
 third_party/intel/ammintrin.internal.h | 28 +-
 third_party/intel/avx2intrin.internal.h | 838 +-
 .../intel/avx5124fmapsintrin.internal.h | 80 +-
 .../intel/avx5124vnniwintrin.internal.h | 47 +-
 .../intel/avx512bitalgintrin.internal.h | 103 +-
 third_party/intel/avx512bwintrin.internal.h | 1438 +--
 third_party/intel/avx512cdintrin.internal.h | 60 +-
 third_party/intel/avx512dqintrin.internal.h | 973 +-
 third_party/intel/avx512erintrin.internal.h | 99 +-
 third_party/intel/avx512fintrin.internal.h | 6565 +++++--------
 third_party/intel/avx512ifmaintrin.internal.h | 32 +-
 .../intel/avx512ifmavlintrin.internal.h | 60 +-
 third_party/intel/avx512pfintrin.internal.h | 113 +-
 .../intel/avx512vbmi2intrin.internal.h | 238 +-
 .../intel/avx512vbmi2vlintrin.internal.h | 474 +-
 third_party/intel/avx512vbmiintrin.internal.h | 53 +-
 .../intel/avx512vbmivlintrin.internal.h | 105 +-
 third_party/intel/avx512vlbwintrin.internal.h | 2038 ++--
 third_party/intel/avx512vldqintrin.internal.h | 773 +-
 third_party/intel/avx512vlintrin.internal.h | 5770 ++++-------
 third_party/intel/avx512vnniintrin.internal.h | 62 +-
 .../intel/avx512vnnivlintrin.internal.h | 114 +-
 .../intel/avx512vpopcntdqintrin.internal.h | 26 +-
 .../intel/avx512vpopcntdqvlintrin.internal.h | 50 +-
 third_party/intel/avxintrin.internal.h | 765 +-
 third_party/intel/bmi2intrin.internal.h | 39 +-
 third_party/intel/cetintrin.internal.h | 44 +-
 third_party/intel/cldemoteintrin.internal.h | 4 +-
 third_party/intel/clflushoptintrin.internal.h | 4 +-
 third_party/intel/clwbintrin.internal.h | 4 +-
 third_party/intel/clzerointrin.internal.h | 4 +-
 third_party/intel/emmintrin.internal.h | 941 +-
 third_party/intel/f16cintrin.internal.h | 24 +-
 third_party/intel/fma4intrin.internal.h | 128 +-
 third_party/intel/fmaintrin.internal.h | 128 +-
 third_party/intel/fxsrintrin.internal.h | 16 +-
 third_party/intel/gfniintrin.internal.h | 161 +-
 third_party/intel/ia32intrin.internal.h | 113 +-
 third_party/intel/immintrin.internal.h | 60 +-
 third_party/intel/lwpintrin.internal.h | 29 +-
 third_party/intel/lzcntintrin.internal.h | 20 +-
 third_party/intel/mm3dnow.internal.h | 112 +-
 third_party/intel/mmintrin.internal.h | 520 +-
 third_party/intel/movdirintrin.internal.h | 12 +-
 third_party/intel/mwaitxintrin.internal.h | 8 +-
 third_party/intel/pconfigintrin.internal.h | 4 +-
 third_party/intel/pkuintrin.internal.h | 8 +-
 third_party/intel/pmmintrin.internal.h | 52 +-
 third_party/intel/popcntintrin.internal.h | 8 +-
 third_party/intel/prfchwintrin.internal.h | 4 +-
 third_party/intel/rdseedintrin.internal.h | 12 +-
 third_party/intel/rtmintrin.internal.h | 12 +-
 third_party/intel/sgxintrin.internal.h | 12 +-
 third_party/intel/shaintrin.internal.h | 28 +-
 third_party/intel/smmintrin.internal.h | 284 +-
 third_party/intel/tbmintrin.internal.h | 81 +-
 third_party/intel/tmmintrin.internal.h | 128 +-
 third_party/intel/vaesintrin.internal.h | 32 +-
 third_party/intel/vpclmulqdqintrin.internal.h | 10 +-
 third_party/intel/waitpkgintrin.internal.h | 12 +-
 third_party/intel/wbnoinvdintrin.internal.h | 4 +-
 third_party/intel/wmmintrin.internal.h | 28 +-
 third_party/intel/xmmintrin.internal.h | 621 +-
 third_party/intel/xopintrin.internal.h | 480 +-
 third_party/intel/xsavecintrin.internal.h | 8 +-
 third_party/intel/xsaveintrin.internal.h | 24 +-
 third_party/intel/xsaveoptintrin.internal.h | 8 +-
 third_party/intel/xsavesintrin.internal.h | 16 +-
 third_party/intel/xtestintrin.internal.h | 4 +-
 tool/emacs/cosmo-c-keywords.el | 1 +
 75 files changed, 12444 insertions(+), 21493 deletions(-)

diff --git a/libc/integral/c.inc b/libc/integral/c.inc
index bb962797c..1aca67eae 100644
--- a/libc/integral/c.inc
+++ b/libc/integral/c.inc
@@ -854,5 +854,9 @@ typedef struct {
   asm(".weak\t" #alias "\n\t" \
       ".equ\t" #alias ", " #sym)
 
+#define __funline \
+  extern __inline \
+  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
 #define MACHINE_CODE_ANALYSIS_BEGIN_
 #define MACHINE_CODE_ANALYSIS_END_

diff --git a/third_party/aarch64/arm_fp16.h b/third_party/aarch64/arm_fp16.h
index 2a0cf3df4..74e9a181d 100644
--- a/third_party/aarch64/arm_fp16.h
+++ b/third_party/aarch64/arm_fp16.h
@@ -8,365 +8,361 @@
 #pragma GCC push_options
 #pragma GCC target("arch=armv8.2-a+fp16")
 
-#define FUNC \
-  __extension__ extern __inline \
-  __attribute__((__always_inline__, __gnu_inline__, __artificial__))
-
 typedef __fp16 float16_t;
 
-FUNC float16_t vabsh_f16(float16_t __a) {
+__funline float16_t vabsh_f16(float16_t __a) {
   return __builtin_aarch64_abshf(__a);
 }
 
-FUNC uint16_t vceqzh_f16(float16_t __a) {
+__funline uint16_t vceqzh_f16(float16_t __a) {
   return __builtin_aarch64_cmeqhf_uss(__a, 0.0f);
 }
 
-FUNC uint16_t vcgezh_f16(float16_t __a) {
+__funline uint16_t vcgezh_f16(float16_t __a) {
   return __builtin_aarch64_cmgehf_uss(__a, 0.0f);
 }
 
-FUNC uint16_t vcgtzh_f16(float16_t __a) {
+__funline uint16_t vcgtzh_f16(float16_t __a) {
   return __builtin_aarch64_cmgthf_uss(__a, 0.0f);
 }
 
-FUNC uint16_t vclezh_f16(float16_t __a) {
+__funline uint16_t vclezh_f16(float16_t __a) {
   return __builtin_aarch64_cmlehf_uss(__a, 0.0f);
 }
 
-FUNC uint16_t vcltzh_f16(float16_t __a) {
+__funline uint16_t vcltzh_f16(float16_t __a) {
   return __builtin_aarch64_cmlthf_uss(__a, 0.0f);
 }
 
-FUNC float16_t vcvth_f16_s16(int16_t __a) {
+__funline float16_t vcvth_f16_s16(int16_t __a) {
   return __builtin_aarch64_floathihf(__a);
 }
 
-FUNC float16_t vcvth_f16_s32(int32_t __a) {
+__funline float16_t vcvth_f16_s32(int32_t __a) {
   return __builtin_aarch64_floatsihf(__a);
 }
 
-FUNC float16_t vcvth_f16_s64(int64_t __a) {
+__funline float16_t vcvth_f16_s64(int64_t __a) {
   return __builtin_aarch64_floatdihf(__a);
 }
 
-FUNC float16_t vcvth_f16_u16(uint16_t __a) {
+__funline float16_t vcvth_f16_u16(uint16_t __a) {
   return __builtin_aarch64_floatunshihf_us(__a);
 }
 
-FUNC float16_t vcvth_f16_u32(uint32_t __a) {
+__funline float16_t vcvth_f16_u32(uint32_t __a) {
   return __builtin_aarch64_floatunssihf_us(__a);
 }
 
-FUNC float16_t vcvth_f16_u64(uint64_t __a) {
+__funline float16_t vcvth_f16_u64(uint64_t __a) {
   return __builtin_aarch64_floatunsdihf_us(__a);
 }
 
-FUNC int16_t vcvth_s16_f16(float16_t __a) {
+__funline int16_t vcvth_s16_f16(float16_t __a) {
   return __builtin_aarch64_fix_trunchfhi(__a);
 }
 
-FUNC int32_t vcvth_s32_f16(float16_t __a) {
+__funline int32_t vcvth_s32_f16(float16_t __a) {
   return __builtin_aarch64_fix_trunchfsi(__a);
 }
 
-FUNC int64_t vcvth_s64_f16(float16_t __a) {
+__funline int64_t vcvth_s64_f16(float16_t __a) {
   return __builtin_aarch64_fix_trunchfdi(__a);
 }
 
-FUNC uint16_t vcvth_u16_f16(float16_t __a) {
+__funline uint16_t vcvth_u16_f16(float16_t __a) {
   return __builtin_aarch64_fixuns_trunchfhi_us(__a);
 }
 
-FUNC uint32_t vcvth_u32_f16(float16_t __a) {
+__funline uint32_t vcvth_u32_f16(float16_t __a) {
   return __builtin_aarch64_fixuns_trunchfsi_us(__a);
 }
 
-FUNC uint64_t vcvth_u64_f16(float16_t __a) {
+__funline uint64_t vcvth_u64_f16(float16_t __a) {
   return __builtin_aarch64_fixuns_trunchfdi_us(__a);
 }
 
-FUNC int16_t vcvtah_s16_f16(float16_t __a) {
+__funline int16_t vcvtah_s16_f16(float16_t __a) {
   return __builtin_aarch64_lroundhfhi(__a);
 }
 
-FUNC int32_t vcvtah_s32_f16(float16_t __a) {
+__funline int32_t vcvtah_s32_f16(float16_t __a) {
   return __builtin_aarch64_lroundhfsi(__a);
 }
 
-FUNC int64_t vcvtah_s64_f16(float16_t __a) {
+__funline int64_t vcvtah_s64_f16(float16_t __a) {
   return __builtin_aarch64_lroundhfdi(__a);
 }
 
-FUNC uint16_t vcvtah_u16_f16(float16_t __a) {
+__funline uint16_t vcvtah_u16_f16(float16_t __a) {
   return __builtin_aarch64_lrounduhfhi_us(__a);
 }
 
-FUNC uint32_t vcvtah_u32_f16(float16_t __a) {
+__funline uint32_t vcvtah_u32_f16(float16_t __a) {
   return __builtin_aarch64_lrounduhfsi_us(__a);
 }
 
-FUNC uint64_t vcvtah_u64_f16(float16_t __a) {
+__funline uint64_t vcvtah_u64_f16(float16_t __a) {
   return __builtin_aarch64_lrounduhfdi_us(__a);
 }
 
-FUNC int16_t vcvtmh_s16_f16(float16_t __a) {
+__funline int16_t vcvtmh_s16_f16(float16_t __a) {
   return __builtin_aarch64_lfloorhfhi(__a);
 }
 
-FUNC int32_t vcvtmh_s32_f16(float16_t __a) {
+__funline int32_t vcvtmh_s32_f16(float16_t __a) {
   return __builtin_aarch64_lfloorhfsi(__a);
 }
 
-FUNC int64_t vcvtmh_s64_f16(float16_t __a) {
+__funline int64_t vcvtmh_s64_f16(float16_t __a) {
   return __builtin_aarch64_lfloorhfdi(__a);
 }
 
-FUNC uint16_t vcvtmh_u16_f16(float16_t __a) {
+__funline uint16_t vcvtmh_u16_f16(float16_t __a) {
   return __builtin_aarch64_lflooruhfhi_us(__a);
 }
 
-FUNC uint32_t vcvtmh_u32_f16(float16_t __a) {
+__funline uint32_t vcvtmh_u32_f16(float16_t __a) {
   return __builtin_aarch64_lflooruhfsi_us(__a);
 }
 
-FUNC uint64_t vcvtmh_u64_f16(float16_t __a) {
+__funline uint64_t vcvtmh_u64_f16(float16_t __a) {
   return __builtin_aarch64_lflooruhfdi_us(__a);
 }
 
-FUNC int16_t vcvtnh_s16_f16(float16_t __a) {
+__funline int16_t vcvtnh_s16_f16(float16_t __a) {
   return __builtin_aarch64_lfrintnhfhi(__a);
 }
 
-FUNC int32_t vcvtnh_s32_f16(float16_t __a) {
+__funline int32_t vcvtnh_s32_f16(float16_t __a) {
   return __builtin_aarch64_lfrintnhfsi(__a);
 }
 
-FUNC int64_t vcvtnh_s64_f16(float16_t __a) {
+__funline int64_t vcvtnh_s64_f16(float16_t __a) {
   return __builtin_aarch64_lfrintnhfdi(__a);
 }
 
-FUNC uint16_t vcvtnh_u16_f16(float16_t __a) {
+__funline uint16_t vcvtnh_u16_f16(float16_t __a) {
   return __builtin_aarch64_lfrintnuhfhi_us(__a);
 }
 
-FUNC uint32_t vcvtnh_u32_f16(float16_t __a) {
+__funline uint32_t vcvtnh_u32_f16(float16_t __a) {
   return __builtin_aarch64_lfrintnuhfsi_us(__a);
 }
 
-FUNC uint64_t vcvtnh_u64_f16(float16_t __a) {
+__funline uint64_t vcvtnh_u64_f16(float16_t __a) {
   return __builtin_aarch64_lfrintnuhfdi_us(__a);
 }
 
-FUNC int16_t vcvtph_s16_f16(float16_t __a) {
+__funline int16_t vcvtph_s16_f16(float16_t __a) {
   return __builtin_aarch64_lceilhfhi(__a);
 }
 
-FUNC int32_t vcvtph_s32_f16(float16_t __a) {
+__funline int32_t vcvtph_s32_f16(float16_t __a) {
   return __builtin_aarch64_lceilhfsi(__a);
 }
 
-FUNC int64_t vcvtph_s64_f16(float16_t __a) {
+__funline int64_t vcvtph_s64_f16(float16_t __a) {
   return __builtin_aarch64_lceilhfdi(__a);
 }
 
-FUNC uint16_t vcvtph_u16_f16(float16_t __a) {
+__funline uint16_t vcvtph_u16_f16(float16_t __a) {
   return __builtin_aarch64_lceiluhfhi_us(__a);
 }
 
-FUNC uint32_t vcvtph_u32_f16(float16_t __a) {
+__funline uint32_t vcvtph_u32_f16(float16_t __a) {
   return __builtin_aarch64_lceiluhfsi_us(__a);
 }
-FUNC uint64_t vcvtph_u64_f16(float16_t __a) { +__funline uint64_t vcvtph_u64_f16(float16_t __a) { return __builtin_aarch64_lceiluhfdi_us(__a); } -FUNC float16_t vnegh_f16(float16_t __a) { +__funline float16_t vnegh_f16(float16_t __a) { return __builtin_aarch64_neghf(__a); } -FUNC float16_t vrecpeh_f16(float16_t __a) { +__funline float16_t vrecpeh_f16(float16_t __a) { return __builtin_aarch64_frecpehf(__a); } -FUNC float16_t vrecpxh_f16(float16_t __a) { +__funline float16_t vrecpxh_f16(float16_t __a) { return __builtin_aarch64_frecpxhf(__a); } -FUNC float16_t vrndh_f16(float16_t __a) { +__funline float16_t vrndh_f16(float16_t __a) { return __builtin_aarch64_btrunchf(__a); } -FUNC float16_t vrndah_f16(float16_t __a) { +__funline float16_t vrndah_f16(float16_t __a) { return __builtin_aarch64_roundhf(__a); } -FUNC float16_t vrndih_f16(float16_t __a) { +__funline float16_t vrndih_f16(float16_t __a) { return __builtin_aarch64_nearbyinthf(__a); } -FUNC float16_t vrndmh_f16(float16_t __a) { +__funline float16_t vrndmh_f16(float16_t __a) { return __builtin_aarch64_floorhf(__a); } -FUNC float16_t vrndnh_f16(float16_t __a) { +__funline float16_t vrndnh_f16(float16_t __a) { return __builtin_aarch64_frintnhf(__a); } -FUNC float16_t vrndph_f16(float16_t __a) { +__funline float16_t vrndph_f16(float16_t __a) { return __builtin_aarch64_ceilhf(__a); } -FUNC float16_t vrndxh_f16(float16_t __a) { +__funline float16_t vrndxh_f16(float16_t __a) { return __builtin_aarch64_rinthf(__a); } -FUNC float16_t vrsqrteh_f16(float16_t __a) { +__funline float16_t vrsqrteh_f16(float16_t __a) { return __builtin_aarch64_rsqrtehf(__a); } -FUNC float16_t vsqrth_f16(float16_t __a) { +__funline float16_t vsqrth_f16(float16_t __a) { return __builtin_aarch64_sqrthf(__a); } -FUNC float16_t vaddh_f16(float16_t __a, float16_t __b) { +__funline float16_t vaddh_f16(float16_t __a, float16_t __b) { return __a + __b; } -FUNC float16_t vabdh_f16(float16_t __a, float16_t __b) { +__funline float16_t vabdh_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_fabdhf(__a, __b); } -FUNC uint16_t vcageh_f16(float16_t __a, float16_t __b) { +__funline uint16_t vcageh_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_facgehf_uss(__a, __b); } -FUNC uint16_t vcagth_f16(float16_t __a, float16_t __b) { +__funline uint16_t vcagth_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_facgthf_uss(__a, __b); } -FUNC uint16_t vcaleh_f16(float16_t __a, float16_t __b) { +__funline uint16_t vcaleh_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_faclehf_uss(__a, __b); } -FUNC uint16_t vcalth_f16(float16_t __a, float16_t __b) { +__funline uint16_t vcalth_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_faclthf_uss(__a, __b); } -FUNC uint16_t vceqh_f16(float16_t __a, float16_t __b) { +__funline uint16_t vceqh_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_cmeqhf_uss(__a, __b); } -FUNC uint16_t vcgeh_f16(float16_t __a, float16_t __b) { +__funline uint16_t vcgeh_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_cmgehf_uss(__a, __b); } -FUNC uint16_t vcgth_f16(float16_t __a, float16_t __b) { +__funline uint16_t vcgth_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_cmgthf_uss(__a, __b); } -FUNC uint16_t vcleh_f16(float16_t __a, float16_t __b) { +__funline uint16_t vcleh_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_cmlehf_uss(__a, __b); } -FUNC uint16_t vclth_f16(float16_t __a, float16_t __b) { +__funline uint16_t vclth_f16(float16_t __a, float16_t __b) { return 
__builtin_aarch64_cmlthf_uss(__a, __b); } -FUNC float16_t vcvth_n_f16_s16(int16_t __a, const int __b) { +__funline float16_t vcvth_n_f16_s16(int16_t __a, const int __b) { return __builtin_aarch64_scvtfhi(__a, __b); } -FUNC float16_t vcvth_n_f16_s32(int32_t __a, const int __b) { +__funline float16_t vcvth_n_f16_s32(int32_t __a, const int __b) { return __builtin_aarch64_scvtfsihf(__a, __b); } -FUNC float16_t vcvth_n_f16_s64(int64_t __a, const int __b) { +__funline float16_t vcvth_n_f16_s64(int64_t __a, const int __b) { return __builtin_aarch64_scvtfdihf(__a, __b); } -FUNC float16_t vcvth_n_f16_u16(uint16_t __a, const int __b) { +__funline float16_t vcvth_n_f16_u16(uint16_t __a, const int __b) { return __builtin_aarch64_ucvtfhi_sus(__a, __b); } -FUNC float16_t vcvth_n_f16_u32(uint32_t __a, const int __b) { +__funline float16_t vcvth_n_f16_u32(uint32_t __a, const int __b) { return __builtin_aarch64_ucvtfsihf_sus(__a, __b); } -FUNC float16_t vcvth_n_f16_u64(uint64_t __a, const int __b) { +__funline float16_t vcvth_n_f16_u64(uint64_t __a, const int __b) { return __builtin_aarch64_ucvtfdihf_sus(__a, __b); } -FUNC int16_t vcvth_n_s16_f16(float16_t __a, const int __b) { +__funline int16_t vcvth_n_s16_f16(float16_t __a, const int __b) { return __builtin_aarch64_fcvtzshf(__a, __b); } -FUNC int32_t vcvth_n_s32_f16(float16_t __a, const int __b) { +__funline int32_t vcvth_n_s32_f16(float16_t __a, const int __b) { return __builtin_aarch64_fcvtzshfsi(__a, __b); } -FUNC int64_t vcvth_n_s64_f16(float16_t __a, const int __b) { +__funline int64_t vcvth_n_s64_f16(float16_t __a, const int __b) { return __builtin_aarch64_fcvtzshfdi(__a, __b); } -FUNC uint16_t vcvth_n_u16_f16(float16_t __a, const int __b) { +__funline uint16_t vcvth_n_u16_f16(float16_t __a, const int __b) { return __builtin_aarch64_fcvtzuhf_uss(__a, __b); } -FUNC uint32_t vcvth_n_u32_f16(float16_t __a, const int __b) { +__funline uint32_t vcvth_n_u32_f16(float16_t __a, const int __b) { return __builtin_aarch64_fcvtzuhfsi_uss(__a, __b); } -FUNC uint64_t vcvth_n_u64_f16(float16_t __a, const int __b) { +__funline uint64_t vcvth_n_u64_f16(float16_t __a, const int __b) { return __builtin_aarch64_fcvtzuhfdi_uss(__a, __b); } -FUNC float16_t vdivh_f16(float16_t __a, float16_t __b) { +__funline float16_t vdivh_f16(float16_t __a, float16_t __b) { return __a / __b; } -FUNC float16_t vmaxh_f16(float16_t __a, float16_t __b) { +__funline float16_t vmaxh_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_fmaxhf(__a, __b); } -FUNC float16_t vmaxnmh_f16(float16_t __a, float16_t __b) { +__funline float16_t vmaxnmh_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_fmaxhf(__a, __b); } -FUNC float16_t vminh_f16(float16_t __a, float16_t __b) { +__funline float16_t vminh_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_fminhf(__a, __b); } -FUNC float16_t vminnmh_f16(float16_t __a, float16_t __b) { +__funline float16_t vminnmh_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_fminhf(__a, __b); } -FUNC float16_t vmulh_f16(float16_t __a, float16_t __b) { +__funline float16_t vmulh_f16(float16_t __a, float16_t __b) { return __a * __b; } -FUNC float16_t vmulxh_f16(float16_t __a, float16_t __b) { +__funline float16_t vmulxh_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_fmulxhf(__a, __b); } -FUNC float16_t vrecpsh_f16(float16_t __a, float16_t __b) { +__funline float16_t vrecpsh_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_frecpshf(__a, __b); } -FUNC float16_t vrsqrtsh_f16(float16_t __a, float16_t __b) 
{ +__funline float16_t vrsqrtsh_f16(float16_t __a, float16_t __b) { return __builtin_aarch64_rsqrtshf(__a, __b); } -FUNC float16_t vsubh_f16(float16_t __a, float16_t __b) { +__funline float16_t vsubh_f16(float16_t __a, float16_t __b) { return __a - __b; } -FUNC float16_t vfmah_f16(float16_t __a, float16_t __b, float16_t __c) { +__funline float16_t vfmah_f16(float16_t __a, float16_t __b, float16_t __c) { return __builtin_aarch64_fmahf(__b, __c, __a); } -FUNC float16_t vfmsh_f16(float16_t __a, float16_t __b, float16_t __c) { +__funline float16_t vfmsh_f16(float16_t __a, float16_t __b, float16_t __c) { return __builtin_aarch64_fnmahf(__b, __c, __a); } diff --git a/third_party/aarch64/arm_neon.h b/third_party/aarch64/arm_neon.h index b14102570..abbf80d17 100644 --- a/third_party/aarch64/arm_neon.h +++ b/third_party/aarch64/arm_neon.h @@ -14,10 +14,6 @@ #define __AARCH64_UINT64_C(__C) ((uint64_t)__C) #define __AARCH64_INT64_C(__C) ((int64_t)__C) -#define FUNK \ - __extension__ extern __inline \ - __attribute__((__always_inline__, __gnu_inline__, __artificial__)) - typedef __Int8x8_t int8x8_t; typedef __Int16x4_t int16x4_t; typedef __Int32x2_t int32x2_t; @@ -531,3251 +527,3266 @@ typedef struct poly16x8x4_t { __vec; \ }) -FUNK int8x8_t vadd_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vadd_s8(int8x8_t __a, int8x8_t __b) { return __a + __b; } -FUNK int16x4_t vadd_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vadd_s16(int16x4_t __a, int16x4_t __b) { return __a + __b; } -FUNK int32x2_t vadd_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vadd_s32(int32x2_t __a, int32x2_t __b) { return __a + __b; } -FUNK float32x2_t vadd_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vadd_f32(float32x2_t __a, float32x2_t __b) { return __a + __b; } -FUNK float64x1_t vadd_f64(float64x1_t __a, float64x1_t __b) { +__funline float64x1_t vadd_f64(float64x1_t __a, float64x1_t __b) { return __a + __b; } -FUNK uint8x8_t vadd_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vadd_u8(uint8x8_t __a, uint8x8_t __b) { return __a + __b; } -FUNK uint16x4_t vadd_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vadd_u16(uint16x4_t __a, uint16x4_t __b) { return __a + __b; } -FUNK uint32x2_t vadd_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vadd_u32(uint32x2_t __a, uint32x2_t __b) { return __a + __b; } -FUNK int64x1_t vadd_s64(int64x1_t __a, int64x1_t __b) { +__funline int64x1_t vadd_s64(int64x1_t __a, int64x1_t __b) { return __a + __b; } -FUNK uint64x1_t vadd_u64(uint64x1_t __a, uint64x1_t __b) { +__funline uint64x1_t vadd_u64(uint64x1_t __a, uint64x1_t __b) { return __a + __b; } -FUNK int8x16_t vaddq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vaddq_s8(int8x16_t __a, int8x16_t __b) { return __a + __b; } -FUNK int16x8_t vaddq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vaddq_s16(int16x8_t __a, int16x8_t __b) { return __a + __b; } -FUNK int32x4_t vaddq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vaddq_s32(int32x4_t __a, int32x4_t __b) { return __a + __b; } -FUNK int64x2_t vaddq_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t vaddq_s64(int64x2_t __a, int64x2_t __b) { return __a + __b; } -FUNK float32x4_t vaddq_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vaddq_f32(float32x4_t __a, float32x4_t __b) { return __a + __b; } -FUNK float64x2_t vaddq_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vaddq_f64(float64x2_t __a, float64x2_t __b) { return __a + __b; } -FUNK uint8x16_t 
vaddq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vaddq_u8(uint8x16_t __a, uint8x16_t __b) { return __a + __b; } -FUNK uint16x8_t vaddq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vaddq_u16(uint16x8_t __a, uint16x8_t __b) { return __a + __b; } -FUNK uint32x4_t vaddq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vaddq_u32(uint32x4_t __a, uint32x4_t __b) { return __a + __b; } -FUNK uint64x2_t vaddq_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vaddq_u64(uint64x2_t __a, uint64x2_t __b) { return __a + __b; } -FUNK int16x8_t vaddl_s8(int8x8_t __a, int8x8_t __b) { +__funline int16x8_t vaddl_s8(int8x8_t __a, int8x8_t __b) { return (int16x8_t)__builtin_aarch64_saddlv8qi(__a, __b); } -FUNK int32x4_t vaddl_s16(int16x4_t __a, int16x4_t __b) { +__funline int32x4_t vaddl_s16(int16x4_t __a, int16x4_t __b) { return (int32x4_t)__builtin_aarch64_saddlv4hi(__a, __b); } -FUNK int64x2_t vaddl_s32(int32x2_t __a, int32x2_t __b) { +__funline int64x2_t vaddl_s32(int32x2_t __a, int32x2_t __b) { return (int64x2_t)__builtin_aarch64_saddlv2si(__a, __b); } -FUNK uint16x8_t vaddl_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint16x8_t vaddl_u8(uint8x8_t __a, uint8x8_t __b) { return (uint16x8_t)__builtin_aarch64_uaddlv8qi((int8x8_t)__a, (int8x8_t)__b); } -FUNK uint32x4_t vaddl_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint32x4_t vaddl_u16(uint16x4_t __a, uint16x4_t __b) { return (uint32x4_t)__builtin_aarch64_uaddlv4hi((int16x4_t)__a, (int16x4_t)__b); } -FUNK uint64x2_t vaddl_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint64x2_t vaddl_u32(uint32x2_t __a, uint32x2_t __b) { return (uint64x2_t)__builtin_aarch64_uaddlv2si((int32x2_t)__a, (int32x2_t)__b); } -FUNK int16x8_t vaddl_high_s8(int8x16_t __a, int8x16_t __b) { +__funline int16x8_t vaddl_high_s8(int8x16_t __a, int8x16_t __b) { return (int16x8_t)__builtin_aarch64_saddl2v16qi(__a, __b); } -FUNK int32x4_t vaddl_high_s16(int16x8_t __a, int16x8_t __b) { +__funline int32x4_t vaddl_high_s16(int16x8_t __a, int16x8_t __b) { return (int32x4_t)__builtin_aarch64_saddl2v8hi(__a, __b); } -FUNK int64x2_t vaddl_high_s32(int32x4_t __a, int32x4_t __b) { +__funline int64x2_t vaddl_high_s32(int32x4_t __a, int32x4_t __b) { return (int64x2_t)__builtin_aarch64_saddl2v4si(__a, __b); } -FUNK uint16x8_t vaddl_high_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint16x8_t vaddl_high_u8(uint8x16_t __a, uint8x16_t __b) { return (uint16x8_t)__builtin_aarch64_uaddl2v16qi((int8x16_t)__a, (int8x16_t)__b); } -FUNK uint32x4_t vaddl_high_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint32x4_t vaddl_high_u16(uint16x8_t __a, uint16x8_t __b) { return (uint32x4_t)__builtin_aarch64_uaddl2v8hi((int16x8_t)__a, (int16x8_t)__b); } -FUNK uint64x2_t vaddl_high_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint64x2_t vaddl_high_u32(uint32x4_t __a, uint32x4_t __b) { return (uint64x2_t)__builtin_aarch64_uaddl2v4si((int32x4_t)__a, (int32x4_t)__b); } -FUNK int16x8_t vaddw_s8(int16x8_t __a, int8x8_t __b) { +__funline int16x8_t vaddw_s8(int16x8_t __a, int8x8_t __b) { return (int16x8_t)__builtin_aarch64_saddwv8qi(__a, __b); } -FUNK int32x4_t vaddw_s16(int32x4_t __a, int16x4_t __b) { +__funline int32x4_t vaddw_s16(int32x4_t __a, int16x4_t __b) { return (int32x4_t)__builtin_aarch64_saddwv4hi(__a, __b); } -FUNK int64x2_t vaddw_s32(int64x2_t __a, int32x2_t __b) { +__funline int64x2_t vaddw_s32(int64x2_t __a, int32x2_t __b) { return (int64x2_t)__builtin_aarch64_saddwv2si(__a, __b); } -FUNK uint16x8_t vaddw_u8(uint16x8_t __a, uint8x8_t __b) { 
+__funline uint16x8_t vaddw_u8(uint16x8_t __a, uint8x8_t __b) { return (uint16x8_t)__builtin_aarch64_uaddwv8qi((int16x8_t)__a, (int8x8_t)__b); } -FUNK uint32x4_t vaddw_u16(uint32x4_t __a, uint16x4_t __b) { +__funline uint32x4_t vaddw_u16(uint32x4_t __a, uint16x4_t __b) { return (uint32x4_t)__builtin_aarch64_uaddwv4hi((int32x4_t)__a, (int16x4_t)__b); } -FUNK uint64x2_t vaddw_u32(uint64x2_t __a, uint32x2_t __b) { +__funline uint64x2_t vaddw_u32(uint64x2_t __a, uint32x2_t __b) { return (uint64x2_t)__builtin_aarch64_uaddwv2si((int64x2_t)__a, (int32x2_t)__b); } -FUNK int16x8_t vaddw_high_s8(int16x8_t __a, int8x16_t __b) { +__funline int16x8_t vaddw_high_s8(int16x8_t __a, int8x16_t __b) { return (int16x8_t)__builtin_aarch64_saddw2v16qi(__a, __b); } -FUNK int32x4_t vaddw_high_s16(int32x4_t __a, int16x8_t __b) { +__funline int32x4_t vaddw_high_s16(int32x4_t __a, int16x8_t __b) { return (int32x4_t)__builtin_aarch64_saddw2v8hi(__a, __b); } -FUNK int64x2_t vaddw_high_s32(int64x2_t __a, int32x4_t __b) { +__funline int64x2_t vaddw_high_s32(int64x2_t __a, int32x4_t __b) { return (int64x2_t)__builtin_aarch64_saddw2v4si(__a, __b); } -FUNK uint16x8_t vaddw_high_u8(uint16x8_t __a, uint8x16_t __b) { +__funline uint16x8_t vaddw_high_u8(uint16x8_t __a, uint8x16_t __b) { return (uint16x8_t)__builtin_aarch64_uaddw2v16qi((int16x8_t)__a, (int8x16_t)__b); } -FUNK uint32x4_t vaddw_high_u16(uint32x4_t __a, uint16x8_t __b) { +__funline uint32x4_t vaddw_high_u16(uint32x4_t __a, uint16x8_t __b) { return (uint32x4_t)__builtin_aarch64_uaddw2v8hi((int32x4_t)__a, (int16x8_t)__b); } -FUNK uint64x2_t vaddw_high_u32(uint64x2_t __a, uint32x4_t __b) { +__funline uint64x2_t vaddw_high_u32(uint64x2_t __a, uint32x4_t __b) { return (uint64x2_t)__builtin_aarch64_uaddw2v4si((int64x2_t)__a, (int32x4_t)__b); } -FUNK int8x8_t vhadd_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vhadd_s8(int8x8_t __a, int8x8_t __b) { return (int8x8_t)__builtin_aarch64_shaddv8qi(__a, __b); } -FUNK int16x4_t vhadd_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vhadd_s16(int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_aarch64_shaddv4hi(__a, __b); } -FUNK int32x2_t vhadd_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vhadd_s32(int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_aarch64_shaddv2si(__a, __b); } -FUNK uint8x8_t vhadd_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vhadd_u8(uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_aarch64_uhaddv8qi((int8x8_t)__a, (int8x8_t)__b); } -FUNK uint16x4_t vhadd_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vhadd_u16(uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_aarch64_uhaddv4hi((int16x4_t)__a, (int16x4_t)__b); } -FUNK uint32x2_t vhadd_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vhadd_u32(uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_aarch64_uhaddv2si((int32x2_t)__a, (int32x2_t)__b); } -FUNK int8x16_t vhaddq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vhaddq_s8(int8x16_t __a, int8x16_t __b) { return (int8x16_t)__builtin_aarch64_shaddv16qi(__a, __b); } -FUNK int16x8_t vhaddq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vhaddq_s16(int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_aarch64_shaddv8hi(__a, __b); } -FUNK int32x4_t vhaddq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vhaddq_s32(int32x4_t __a, int32x4_t __b) { return (int32x4_t)__builtin_aarch64_shaddv4si(__a, __b); } -FUNK uint8x16_t vhaddq_u8(uint8x16_t __a, uint8x16_t __b) { 
+__funline uint8x16_t vhaddq_u8(uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t)__builtin_aarch64_uhaddv16qi((int8x16_t)__a, (int8x16_t)__b); } -FUNK uint16x8_t vhaddq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vhaddq_u16(uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t)__builtin_aarch64_uhaddv8hi((int16x8_t)__a, (int16x8_t)__b); } -FUNK uint32x4_t vhaddq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vhaddq_u32(uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t)__builtin_aarch64_uhaddv4si((int32x4_t)__a, (int32x4_t)__b); } -FUNK int8x8_t vrhadd_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vrhadd_s8(int8x8_t __a, int8x8_t __b) { return (int8x8_t)__builtin_aarch64_srhaddv8qi(__a, __b); } -FUNK int16x4_t vrhadd_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vrhadd_s16(int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_aarch64_srhaddv4hi(__a, __b); } -FUNK int32x2_t vrhadd_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vrhadd_s32(int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_aarch64_srhaddv2si(__a, __b); } -FUNK uint8x8_t vrhadd_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vrhadd_u8(uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_aarch64_urhaddv8qi((int8x8_t)__a, (int8x8_t)__b); } -FUNK uint16x4_t vrhadd_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vrhadd_u16(uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_aarch64_urhaddv4hi((int16x4_t)__a, (int16x4_t)__b); } -FUNK uint32x2_t vrhadd_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vrhadd_u32(uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_aarch64_urhaddv2si((int32x2_t)__a, (int32x2_t)__b); } -FUNK int8x16_t vrhaddq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vrhaddq_s8(int8x16_t __a, int8x16_t __b) { return (int8x16_t)__builtin_aarch64_srhaddv16qi(__a, __b); } -FUNK int16x8_t vrhaddq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vrhaddq_s16(int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_aarch64_srhaddv8hi(__a, __b); } -FUNK int32x4_t vrhaddq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vrhaddq_s32(int32x4_t __a, int32x4_t __b) { return (int32x4_t)__builtin_aarch64_srhaddv4si(__a, __b); } -FUNK uint8x16_t vrhaddq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vrhaddq_u8(uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t)__builtin_aarch64_urhaddv16qi((int8x16_t)__a, (int8x16_t)__b); } -FUNK uint16x8_t vrhaddq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vrhaddq_u16(uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t)__builtin_aarch64_urhaddv8hi((int16x8_t)__a, (int16x8_t)__b); } -FUNK uint32x4_t vrhaddq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vrhaddq_u32(uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t)__builtin_aarch64_urhaddv4si((int32x4_t)__a, (int32x4_t)__b); } -FUNK int8x8_t vaddhn_s16(int16x8_t __a, int16x8_t __b) { +__funline int8x8_t vaddhn_s16(int16x8_t __a, int16x8_t __b) { return (int8x8_t)__builtin_aarch64_addhnv8hi(__a, __b); } -FUNK int16x4_t vaddhn_s32(int32x4_t __a, int32x4_t __b) { +__funline int16x4_t vaddhn_s32(int32x4_t __a, int32x4_t __b) { return (int16x4_t)__builtin_aarch64_addhnv4si(__a, __b); } -FUNK int32x2_t vaddhn_s64(int64x2_t __a, int64x2_t __b) { +__funline int32x2_t vaddhn_s64(int64x2_t __a, int64x2_t __b) { return (int32x2_t)__builtin_aarch64_addhnv2di(__a, __b); } -FUNK uint8x8_t vaddhn_u16(uint16x8_t __a, uint16x8_t __b) { +__funline 
uint8x8_t vaddhn_u16(uint16x8_t __a, uint16x8_t __b) { return (uint8x8_t)__builtin_aarch64_addhnv8hi((int16x8_t)__a, (int16x8_t)__b); } -FUNK uint16x4_t vaddhn_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint16x4_t vaddhn_u32(uint32x4_t __a, uint32x4_t __b) { return (uint16x4_t)__builtin_aarch64_addhnv4si((int32x4_t)__a, (int32x4_t)__b); } -FUNK uint32x2_t vaddhn_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint32x2_t vaddhn_u64(uint64x2_t __a, uint64x2_t __b) { return (uint32x2_t)__builtin_aarch64_addhnv2di((int64x2_t)__a, (int64x2_t)__b); } -FUNK int8x8_t vraddhn_s16(int16x8_t __a, int16x8_t __b) { +__funline int8x8_t vraddhn_s16(int16x8_t __a, int16x8_t __b) { return (int8x8_t)__builtin_aarch64_raddhnv8hi(__a, __b); } -FUNK int16x4_t vraddhn_s32(int32x4_t __a, int32x4_t __b) { +__funline int16x4_t vraddhn_s32(int32x4_t __a, int32x4_t __b) { return (int16x4_t)__builtin_aarch64_raddhnv4si(__a, __b); } -FUNK int32x2_t vraddhn_s64(int64x2_t __a, int64x2_t __b) { +__funline int32x2_t vraddhn_s64(int64x2_t __a, int64x2_t __b) { return (int32x2_t)__builtin_aarch64_raddhnv2di(__a, __b); } -FUNK uint8x8_t vraddhn_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint8x8_t vraddhn_u16(uint16x8_t __a, uint16x8_t __b) { return (uint8x8_t)__builtin_aarch64_raddhnv8hi((int16x8_t)__a, (int16x8_t)__b); } -FUNK uint16x4_t vraddhn_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint16x4_t vraddhn_u32(uint32x4_t __a, uint32x4_t __b) { return (uint16x4_t)__builtin_aarch64_raddhnv4si((int32x4_t)__a, (int32x4_t)__b); } -FUNK uint32x2_t vraddhn_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint32x2_t vraddhn_u64(uint64x2_t __a, uint64x2_t __b) { return (uint32x2_t)__builtin_aarch64_raddhnv2di((int64x2_t)__a, (int64x2_t)__b); } -FUNK int8x16_t vaddhn_high_s16(int8x8_t __a, int16x8_t __b, int16x8_t __c) { +__funline int8x16_t vaddhn_high_s16(int8x8_t __a, int16x8_t __b, + int16x8_t __c) { return (int8x16_t)__builtin_aarch64_addhn2v8hi(__a, __b, __c); } -FUNK int16x8_t vaddhn_high_s32(int16x4_t __a, int32x4_t __b, int32x4_t __c) { +__funline int16x8_t vaddhn_high_s32(int16x4_t __a, int32x4_t __b, + int32x4_t __c) { return (int16x8_t)__builtin_aarch64_addhn2v4si(__a, __b, __c); } -FUNK int32x4_t vaddhn_high_s64(int32x2_t __a, int64x2_t __b, int64x2_t __c) { +__funline int32x4_t vaddhn_high_s64(int32x2_t __a, int64x2_t __b, + int64x2_t __c) { return (int32x4_t)__builtin_aarch64_addhn2v2di(__a, __b, __c); } -FUNK uint8x16_t vaddhn_high_u16(uint8x8_t __a, uint16x8_t __b, uint16x8_t __c) { +__funline uint8x16_t vaddhn_high_u16(uint8x8_t __a, uint16x8_t __b, + uint16x8_t __c) { return (uint8x16_t)__builtin_aarch64_addhn2v8hi((int8x8_t)__a, (int16x8_t)__b, (int16x8_t)__c); } -FUNK uint16x8_t vaddhn_high_u32(uint16x4_t __a, uint32x4_t __b, - uint32x4_t __c) { +__funline uint16x8_t vaddhn_high_u32(uint16x4_t __a, uint32x4_t __b, + uint32x4_t __c) { return (uint16x8_t)__builtin_aarch64_addhn2v4si( (int16x4_t)__a, (int32x4_t)__b, (int32x4_t)__c); } -FUNK uint32x4_t vaddhn_high_u64(uint32x2_t __a, uint64x2_t __b, - uint64x2_t __c) { +__funline uint32x4_t vaddhn_high_u64(uint32x2_t __a, uint64x2_t __b, + uint64x2_t __c) { return (uint32x4_t)__builtin_aarch64_addhn2v2di( (int32x2_t)__a, (int64x2_t)__b, (int64x2_t)__c); } -FUNK int8x16_t vraddhn_high_s16(int8x8_t __a, int16x8_t __b, int16x8_t __c) { +__funline int8x16_t vraddhn_high_s16(int8x8_t __a, int16x8_t __b, + int16x8_t __c) { return (int8x16_t)__builtin_aarch64_raddhn2v8hi(__a, __b, __c); } -FUNK int16x8_t vraddhn_high_s32(int16x4_t __a, int32x4_t __b, 
int32x4_t __c) { +__funline int16x8_t vraddhn_high_s32(int16x4_t __a, int32x4_t __b, + int32x4_t __c) { return (int16x8_t)__builtin_aarch64_raddhn2v4si(__a, __b, __c); } -FUNK int32x4_t vraddhn_high_s64(int32x2_t __a, int64x2_t __b, int64x2_t __c) { +__funline int32x4_t vraddhn_high_s64(int32x2_t __a, int64x2_t __b, + int64x2_t __c) { return (int32x4_t)__builtin_aarch64_raddhn2v2di(__a, __b, __c); } -FUNK uint8x16_t vraddhn_high_u16(uint8x8_t __a, uint16x8_t __b, - uint16x8_t __c) { +__funline uint8x16_t vraddhn_high_u16(uint8x8_t __a, uint16x8_t __b, + uint16x8_t __c) { return (uint8x16_t)__builtin_aarch64_raddhn2v8hi( (int8x8_t)__a, (int16x8_t)__b, (int16x8_t)__c); } -FUNK uint16x8_t vraddhn_high_u32(uint16x4_t __a, uint32x4_t __b, - uint32x4_t __c) { +__funline uint16x8_t vraddhn_high_u32(uint16x4_t __a, uint32x4_t __b, + uint32x4_t __c) { return (uint16x8_t)__builtin_aarch64_raddhn2v4si( (int16x4_t)__a, (int32x4_t)__b, (int32x4_t)__c); } -FUNK uint32x4_t vraddhn_high_u64(uint32x2_t __a, uint64x2_t __b, - uint64x2_t __c) { +__funline uint32x4_t vraddhn_high_u64(uint32x2_t __a, uint64x2_t __b, + uint64x2_t __c) { return (uint32x4_t)__builtin_aarch64_raddhn2v2di( (int32x2_t)__a, (int64x2_t)__b, (int64x2_t)__c); } -FUNK float32x2_t vdiv_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vdiv_f32(float32x2_t __a, float32x2_t __b) { return __a / __b; } -FUNK float64x1_t vdiv_f64(float64x1_t __a, float64x1_t __b) { +__funline float64x1_t vdiv_f64(float64x1_t __a, float64x1_t __b) { return __a / __b; } -FUNK float32x4_t vdivq_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vdivq_f32(float32x4_t __a, float32x4_t __b) { return __a / __b; } -FUNK float64x2_t vdivq_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vdivq_f64(float64x2_t __a, float64x2_t __b) { return __a / __b; } -FUNK int8x8_t vmul_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vmul_s8(int8x8_t __a, int8x8_t __b) { return __a * __b; } -FUNK int16x4_t vmul_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vmul_s16(int16x4_t __a, int16x4_t __b) { return __a * __b; } -FUNK int32x2_t vmul_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vmul_s32(int32x2_t __a, int32x2_t __b) { return __a * __b; } -FUNK float32x2_t vmul_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vmul_f32(float32x2_t __a, float32x2_t __b) { return __a * __b; } -FUNK float64x1_t vmul_f64(float64x1_t __a, float64x1_t __b) { +__funline float64x1_t vmul_f64(float64x1_t __a, float64x1_t __b) { return __a * __b; } -FUNK uint8x8_t vmul_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vmul_u8(uint8x8_t __a, uint8x8_t __b) { return __a * __b; } -FUNK uint16x4_t vmul_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vmul_u16(uint16x4_t __a, uint16x4_t __b) { return __a * __b; } -FUNK uint32x2_t vmul_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vmul_u32(uint32x2_t __a, uint32x2_t __b) { return __a * __b; } -FUNK poly8x8_t vmul_p8(poly8x8_t __a, poly8x8_t __b) { +__funline poly8x8_t vmul_p8(poly8x8_t __a, poly8x8_t __b) { return (poly8x8_t)__builtin_aarch64_pmulv8qi((int8x8_t)__a, (int8x8_t)__b); } -FUNK int8x16_t vmulq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vmulq_s8(int8x16_t __a, int8x16_t __b) { return __a * __b; } -FUNK int16x8_t vmulq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vmulq_s16(int16x8_t __a, int16x8_t __b) { return __a * __b; } -FUNK int32x4_t vmulq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vmulq_s32(int32x4_t 
__a, int32x4_t __b) { return __a * __b; } -FUNK float32x4_t vmulq_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vmulq_f32(float32x4_t __a, float32x4_t __b) { return __a * __b; } -FUNK float64x2_t vmulq_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vmulq_f64(float64x2_t __a, float64x2_t __b) { return __a * __b; } -FUNK uint8x16_t vmulq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vmulq_u8(uint8x16_t __a, uint8x16_t __b) { return __a * __b; } -FUNK uint16x8_t vmulq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vmulq_u16(uint16x8_t __a, uint16x8_t __b) { return __a * __b; } -FUNK uint32x4_t vmulq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vmulq_u32(uint32x4_t __a, uint32x4_t __b) { return __a * __b; } -FUNK poly8x16_t vmulq_p8(poly8x16_t __a, poly8x16_t __b) { +__funline poly8x16_t vmulq_p8(poly8x16_t __a, poly8x16_t __b) { return (poly8x16_t)__builtin_aarch64_pmulv16qi((int8x16_t)__a, (int8x16_t)__b); } -FUNK int8x8_t vand_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vand_s8(int8x8_t __a, int8x8_t __b) { return __a & __b; } -FUNK int16x4_t vand_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vand_s16(int16x4_t __a, int16x4_t __b) { return __a & __b; } -FUNK int32x2_t vand_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vand_s32(int32x2_t __a, int32x2_t __b) { return __a & __b; } -FUNK uint8x8_t vand_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vand_u8(uint8x8_t __a, uint8x8_t __b) { return __a & __b; } -FUNK uint16x4_t vand_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vand_u16(uint16x4_t __a, uint16x4_t __b) { return __a & __b; } -FUNK uint32x2_t vand_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vand_u32(uint32x2_t __a, uint32x2_t __b) { return __a & __b; } -FUNK int64x1_t vand_s64(int64x1_t __a, int64x1_t __b) { +__funline int64x1_t vand_s64(int64x1_t __a, int64x1_t __b) { return __a & __b; } -FUNK uint64x1_t vand_u64(uint64x1_t __a, uint64x1_t __b) { +__funline uint64x1_t vand_u64(uint64x1_t __a, uint64x1_t __b) { return __a & __b; } -FUNK int8x16_t vandq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vandq_s8(int8x16_t __a, int8x16_t __b) { return __a & __b; } -FUNK int16x8_t vandq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vandq_s16(int16x8_t __a, int16x8_t __b) { return __a & __b; } -FUNK int32x4_t vandq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vandq_s32(int32x4_t __a, int32x4_t __b) { return __a & __b; } -FUNK int64x2_t vandq_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t vandq_s64(int64x2_t __a, int64x2_t __b) { return __a & __b; } -FUNK uint8x16_t vandq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vandq_u8(uint8x16_t __a, uint8x16_t __b) { return __a & __b; } -FUNK uint16x8_t vandq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vandq_u16(uint16x8_t __a, uint16x8_t __b) { return __a & __b; } -FUNK uint32x4_t vandq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vandq_u32(uint32x4_t __a, uint32x4_t __b) { return __a & __b; } -FUNK uint64x2_t vandq_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vandq_u64(uint64x2_t __a, uint64x2_t __b) { return __a & __b; } -FUNK int8x8_t vorr_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vorr_s8(int8x8_t __a, int8x8_t __b) { return __a | __b; } -FUNK int16x4_t vorr_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vorr_s16(int16x4_t __a, int16x4_t __b) { return __a | __b; } -FUNK int32x2_t 
vorr_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vorr_s32(int32x2_t __a, int32x2_t __b) { return __a | __b; } -FUNK uint8x8_t vorr_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vorr_u8(uint8x8_t __a, uint8x8_t __b) { return __a | __b; } -FUNK uint16x4_t vorr_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vorr_u16(uint16x4_t __a, uint16x4_t __b) { return __a | __b; } -FUNK uint32x2_t vorr_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vorr_u32(uint32x2_t __a, uint32x2_t __b) { return __a | __b; } -FUNK int64x1_t vorr_s64(int64x1_t __a, int64x1_t __b) { +__funline int64x1_t vorr_s64(int64x1_t __a, int64x1_t __b) { return __a | __b; } -FUNK uint64x1_t vorr_u64(uint64x1_t __a, uint64x1_t __b) { +__funline uint64x1_t vorr_u64(uint64x1_t __a, uint64x1_t __b) { return __a | __b; } -FUNK int8x16_t vorrq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vorrq_s8(int8x16_t __a, int8x16_t __b) { return __a | __b; } -FUNK int16x8_t vorrq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vorrq_s16(int16x8_t __a, int16x8_t __b) { return __a | __b; } -FUNK int32x4_t vorrq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vorrq_s32(int32x4_t __a, int32x4_t __b) { return __a | __b; } -FUNK int64x2_t vorrq_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t vorrq_s64(int64x2_t __a, int64x2_t __b) { return __a | __b; } -FUNK uint8x16_t vorrq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vorrq_u8(uint8x16_t __a, uint8x16_t __b) { return __a | __b; } -FUNK uint16x8_t vorrq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vorrq_u16(uint16x8_t __a, uint16x8_t __b) { return __a | __b; } -FUNK uint32x4_t vorrq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vorrq_u32(uint32x4_t __a, uint32x4_t __b) { return __a | __b; } -FUNK uint64x2_t vorrq_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vorrq_u64(uint64x2_t __a, uint64x2_t __b) { return __a | __b; } -FUNK int8x8_t veor_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t veor_s8(int8x8_t __a, int8x8_t __b) { return __a ^ __b; } -FUNK int16x4_t veor_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t veor_s16(int16x4_t __a, int16x4_t __b) { return __a ^ __b; } -FUNK int32x2_t veor_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t veor_s32(int32x2_t __a, int32x2_t __b) { return __a ^ __b; } -FUNK uint8x8_t veor_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t veor_u8(uint8x8_t __a, uint8x8_t __b) { return __a ^ __b; } -FUNK uint16x4_t veor_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t veor_u16(uint16x4_t __a, uint16x4_t __b) { return __a ^ __b; } -FUNK uint32x2_t veor_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t veor_u32(uint32x2_t __a, uint32x2_t __b) { return __a ^ __b; } -FUNK int64x1_t veor_s64(int64x1_t __a, int64x1_t __b) { +__funline int64x1_t veor_s64(int64x1_t __a, int64x1_t __b) { return __a ^ __b; } -FUNK uint64x1_t veor_u64(uint64x1_t __a, uint64x1_t __b) { +__funline uint64x1_t veor_u64(uint64x1_t __a, uint64x1_t __b) { return __a ^ __b; } -FUNK int8x16_t veorq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t veorq_s8(int8x16_t __a, int8x16_t __b) { return __a ^ __b; } -FUNK int16x8_t veorq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t veorq_s16(int16x8_t __a, int16x8_t __b) { return __a ^ __b; } -FUNK int32x4_t veorq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t veorq_s32(int32x4_t __a, int32x4_t __b) { return __a ^ __b; } -FUNK int64x2_t 
veorq_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t veorq_s64(int64x2_t __a, int64x2_t __b) { return __a ^ __b; } -FUNK uint8x16_t veorq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t veorq_u8(uint8x16_t __a, uint8x16_t __b) { return __a ^ __b; } -FUNK uint16x8_t veorq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t veorq_u16(uint16x8_t __a, uint16x8_t __b) { return __a ^ __b; } -FUNK uint32x4_t veorq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t veorq_u32(uint32x4_t __a, uint32x4_t __b) { return __a ^ __b; } -FUNK uint64x2_t veorq_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t veorq_u64(uint64x2_t __a, uint64x2_t __b) { return __a ^ __b; } -FUNK int8x8_t vbic_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vbic_s8(int8x8_t __a, int8x8_t __b) { return __a & ~__b; } -FUNK int16x4_t vbic_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vbic_s16(int16x4_t __a, int16x4_t __b) { return __a & ~__b; } -FUNK int32x2_t vbic_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vbic_s32(int32x2_t __a, int32x2_t __b) { return __a & ~__b; } -FUNK uint8x8_t vbic_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vbic_u8(uint8x8_t __a, uint8x8_t __b) { return __a & ~__b; } -FUNK uint16x4_t vbic_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vbic_u16(uint16x4_t __a, uint16x4_t __b) { return __a & ~__b; } -FUNK uint32x2_t vbic_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vbic_u32(uint32x2_t __a, uint32x2_t __b) { return __a & ~__b; } -FUNK int64x1_t vbic_s64(int64x1_t __a, int64x1_t __b) { +__funline int64x1_t vbic_s64(int64x1_t __a, int64x1_t __b) { return __a & ~__b; } -FUNK uint64x1_t vbic_u64(uint64x1_t __a, uint64x1_t __b) { +__funline uint64x1_t vbic_u64(uint64x1_t __a, uint64x1_t __b) { return __a & ~__b; } -FUNK int8x16_t vbicq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vbicq_s8(int8x16_t __a, int8x16_t __b) { return __a & ~__b; } -FUNK int16x8_t vbicq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vbicq_s16(int16x8_t __a, int16x8_t __b) { return __a & ~__b; } -FUNK int32x4_t vbicq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vbicq_s32(int32x4_t __a, int32x4_t __b) { return __a & ~__b; } -FUNK int64x2_t vbicq_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t vbicq_s64(int64x2_t __a, int64x2_t __b) { return __a & ~__b; } -FUNK uint8x16_t vbicq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vbicq_u8(uint8x16_t __a, uint8x16_t __b) { return __a & ~__b; } -FUNK uint16x8_t vbicq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vbicq_u16(uint16x8_t __a, uint16x8_t __b) { return __a & ~__b; } -FUNK uint32x4_t vbicq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vbicq_u32(uint32x4_t __a, uint32x4_t __b) { return __a & ~__b; } -FUNK uint64x2_t vbicq_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vbicq_u64(uint64x2_t __a, uint64x2_t __b) { return __a & ~__b; } -FUNK int8x8_t vorn_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vorn_s8(int8x8_t __a, int8x8_t __b) { return __a | ~__b; } -FUNK int16x4_t vorn_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vorn_s16(int16x4_t __a, int16x4_t __b) { return __a | ~__b; } -FUNK int32x2_t vorn_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vorn_s32(int32x2_t __a, int32x2_t __b) { return __a | ~__b; } -FUNK uint8x8_t vorn_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vorn_u8(uint8x8_t __a, uint8x8_t __b) { return __a | ~__b; } -FUNK 
uint16x4_t vorn_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vorn_u16(uint16x4_t __a, uint16x4_t __b) { return __a | ~__b; } -FUNK uint32x2_t vorn_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vorn_u32(uint32x2_t __a, uint32x2_t __b) { return __a | ~__b; } -FUNK int64x1_t vorn_s64(int64x1_t __a, int64x1_t __b) { +__funline int64x1_t vorn_s64(int64x1_t __a, int64x1_t __b) { return __a | ~__b; } -FUNK uint64x1_t vorn_u64(uint64x1_t __a, uint64x1_t __b) { +__funline uint64x1_t vorn_u64(uint64x1_t __a, uint64x1_t __b) { return __a | ~__b; } -FUNK int8x16_t vornq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vornq_s8(int8x16_t __a, int8x16_t __b) { return __a | ~__b; } -FUNK int16x8_t vornq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vornq_s16(int16x8_t __a, int16x8_t __b) { return __a | ~__b; } -FUNK int32x4_t vornq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vornq_s32(int32x4_t __a, int32x4_t __b) { return __a | ~__b; } -FUNK int64x2_t vornq_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t vornq_s64(int64x2_t __a, int64x2_t __b) { return __a | ~__b; } -FUNK uint8x16_t vornq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vornq_u8(uint8x16_t __a, uint8x16_t __b) { return __a | ~__b; } -FUNK uint16x8_t vornq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vornq_u16(uint16x8_t __a, uint16x8_t __b) { return __a | ~__b; } -FUNK uint32x4_t vornq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vornq_u32(uint32x4_t __a, uint32x4_t __b) { return __a | ~__b; } -FUNK uint64x2_t vornq_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vornq_u64(uint64x2_t __a, uint64x2_t __b) { return __a | ~__b; } -FUNK int8x8_t vsub_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vsub_s8(int8x8_t __a, int8x8_t __b) { return __a - __b; } -FUNK int16x4_t vsub_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vsub_s16(int16x4_t __a, int16x4_t __b) { return __a - __b; } -FUNK int32x2_t vsub_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vsub_s32(int32x2_t __a, int32x2_t __b) { return __a - __b; } -FUNK float32x2_t vsub_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vsub_f32(float32x2_t __a, float32x2_t __b) { return __a - __b; } -FUNK float64x1_t vsub_f64(float64x1_t __a, float64x1_t __b) { +__funline float64x1_t vsub_f64(float64x1_t __a, float64x1_t __b) { return __a - __b; } -FUNK uint8x8_t vsub_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vsub_u8(uint8x8_t __a, uint8x8_t __b) { return __a - __b; } -FUNK uint16x4_t vsub_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vsub_u16(uint16x4_t __a, uint16x4_t __b) { return __a - __b; } -FUNK uint32x2_t vsub_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vsub_u32(uint32x2_t __a, uint32x2_t __b) { return __a - __b; } -FUNK int64x1_t vsub_s64(int64x1_t __a, int64x1_t __b) { +__funline int64x1_t vsub_s64(int64x1_t __a, int64x1_t __b) { return __a - __b; } -FUNK uint64x1_t vsub_u64(uint64x1_t __a, uint64x1_t __b) { +__funline uint64x1_t vsub_u64(uint64x1_t __a, uint64x1_t __b) { return __a - __b; } -FUNK int8x16_t vsubq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vsubq_s8(int8x16_t __a, int8x16_t __b) { return __a - __b; } -FUNK int16x8_t vsubq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vsubq_s16(int16x8_t __a, int16x8_t __b) { return __a - __b; } -FUNK int32x4_t vsubq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vsubq_s32(int32x4_t __a, int32x4_t __b) { 
return __a - __b; } -FUNK int64x2_t vsubq_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t vsubq_s64(int64x2_t __a, int64x2_t __b) { return __a - __b; } -FUNK float32x4_t vsubq_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vsubq_f32(float32x4_t __a, float32x4_t __b) { return __a - __b; } -FUNK float64x2_t vsubq_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vsubq_f64(float64x2_t __a, float64x2_t __b) { return __a - __b; } -FUNK uint8x16_t vsubq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vsubq_u8(uint8x16_t __a, uint8x16_t __b) { return __a - __b; } -FUNK uint16x8_t vsubq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vsubq_u16(uint16x8_t __a, uint16x8_t __b) { return __a - __b; } -FUNK uint32x4_t vsubq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vsubq_u32(uint32x4_t __a, uint32x4_t __b) { return __a - __b; } -FUNK uint64x2_t vsubq_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vsubq_u64(uint64x2_t __a, uint64x2_t __b) { return __a - __b; } -FUNK int16x8_t vsubl_s8(int8x8_t __a, int8x8_t __b) { +__funline int16x8_t vsubl_s8(int8x8_t __a, int8x8_t __b) { return (int16x8_t)__builtin_aarch64_ssublv8qi(__a, __b); } -FUNK int32x4_t vsubl_s16(int16x4_t __a, int16x4_t __b) { +__funline int32x4_t vsubl_s16(int16x4_t __a, int16x4_t __b) { return (int32x4_t)__builtin_aarch64_ssublv4hi(__a, __b); } -FUNK int64x2_t vsubl_s32(int32x2_t __a, int32x2_t __b) { +__funline int64x2_t vsubl_s32(int32x2_t __a, int32x2_t __b) { return (int64x2_t)__builtin_aarch64_ssublv2si(__a, __b); } -FUNK uint16x8_t vsubl_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint16x8_t vsubl_u8(uint8x8_t __a, uint8x8_t __b) { return (uint16x8_t)__builtin_aarch64_usublv8qi((int8x8_t)__a, (int8x8_t)__b); } -FUNK uint32x4_t vsubl_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint32x4_t vsubl_u16(uint16x4_t __a, uint16x4_t __b) { return (uint32x4_t)__builtin_aarch64_usublv4hi((int16x4_t)__a, (int16x4_t)__b); } -FUNK uint64x2_t vsubl_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint64x2_t vsubl_u32(uint32x2_t __a, uint32x2_t __b) { return (uint64x2_t)__builtin_aarch64_usublv2si((int32x2_t)__a, (int32x2_t)__b); } -FUNK int16x8_t vsubl_high_s8(int8x16_t __a, int8x16_t __b) { +__funline int16x8_t vsubl_high_s8(int8x16_t __a, int8x16_t __b) { return (int16x8_t)__builtin_aarch64_ssubl2v16qi(__a, __b); } -FUNK int32x4_t vsubl_high_s16(int16x8_t __a, int16x8_t __b) { +__funline int32x4_t vsubl_high_s16(int16x8_t __a, int16x8_t __b) { return (int32x4_t)__builtin_aarch64_ssubl2v8hi(__a, __b); } -FUNK int64x2_t vsubl_high_s32(int32x4_t __a, int32x4_t __b) { +__funline int64x2_t vsubl_high_s32(int32x4_t __a, int32x4_t __b) { return (int64x2_t)__builtin_aarch64_ssubl2v4si(__a, __b); } -FUNK uint16x8_t vsubl_high_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint16x8_t vsubl_high_u8(uint8x16_t __a, uint8x16_t __b) { return (uint16x8_t)__builtin_aarch64_usubl2v16qi((int8x16_t)__a, (int8x16_t)__b); } -FUNK uint32x4_t vsubl_high_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint32x4_t vsubl_high_u16(uint16x8_t __a, uint16x8_t __b) { return (uint32x4_t)__builtin_aarch64_usubl2v8hi((int16x8_t)__a, (int16x8_t)__b); } -FUNK uint64x2_t vsubl_high_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint64x2_t vsubl_high_u32(uint32x4_t __a, uint32x4_t __b) { return (uint64x2_t)__builtin_aarch64_usubl2v4si((int32x4_t)__a, (int32x4_t)__b); } -FUNK int16x8_t vsubw_s8(int16x8_t __a, int8x8_t __b) { +__funline int16x8_t vsubw_s8(int16x8_t __a, int8x8_t __b) { 
return (int16x8_t)__builtin_aarch64_ssubwv8qi(__a, __b); } -FUNK int32x4_t vsubw_s16(int32x4_t __a, int16x4_t __b) { +__funline int32x4_t vsubw_s16(int32x4_t __a, int16x4_t __b) { return (int32x4_t)__builtin_aarch64_ssubwv4hi(__a, __b); } -FUNK int64x2_t vsubw_s32(int64x2_t __a, int32x2_t __b) { +__funline int64x2_t vsubw_s32(int64x2_t __a, int32x2_t __b) { return (int64x2_t)__builtin_aarch64_ssubwv2si(__a, __b); } -FUNK uint16x8_t vsubw_u8(uint16x8_t __a, uint8x8_t __b) { +__funline uint16x8_t vsubw_u8(uint16x8_t __a, uint8x8_t __b) { return (uint16x8_t)__builtin_aarch64_usubwv8qi((int16x8_t)__a, (int8x8_t)__b); } -FUNK uint32x4_t vsubw_u16(uint32x4_t __a, uint16x4_t __b) { +__funline uint32x4_t vsubw_u16(uint32x4_t __a, uint16x4_t __b) { return (uint32x4_t)__builtin_aarch64_usubwv4hi((int32x4_t)__a, (int16x4_t)__b); } -FUNK uint64x2_t vsubw_u32(uint64x2_t __a, uint32x2_t __b) { +__funline uint64x2_t vsubw_u32(uint64x2_t __a, uint32x2_t __b) { return (uint64x2_t)__builtin_aarch64_usubwv2si((int64x2_t)__a, (int32x2_t)__b); } -FUNK int16x8_t vsubw_high_s8(int16x8_t __a, int8x16_t __b) { +__funline int16x8_t vsubw_high_s8(int16x8_t __a, int8x16_t __b) { return (int16x8_t)__builtin_aarch64_ssubw2v16qi(__a, __b); } -FUNK int32x4_t vsubw_high_s16(int32x4_t __a, int16x8_t __b) { +__funline int32x4_t vsubw_high_s16(int32x4_t __a, int16x8_t __b) { return (int32x4_t)__builtin_aarch64_ssubw2v8hi(__a, __b); } -FUNK int64x2_t vsubw_high_s32(int64x2_t __a, int32x4_t __b) { +__funline int64x2_t vsubw_high_s32(int64x2_t __a, int32x4_t __b) { return (int64x2_t)__builtin_aarch64_ssubw2v4si(__a, __b); } -FUNK uint16x8_t vsubw_high_u8(uint16x8_t __a, uint8x16_t __b) { +__funline uint16x8_t vsubw_high_u8(uint16x8_t __a, uint8x16_t __b) { return (uint16x8_t)__builtin_aarch64_usubw2v16qi((int16x8_t)__a, (int8x16_t)__b); } -FUNK uint32x4_t vsubw_high_u16(uint32x4_t __a, uint16x8_t __b) { +__funline uint32x4_t vsubw_high_u16(uint32x4_t __a, uint16x8_t __b) { return (uint32x4_t)__builtin_aarch64_usubw2v8hi((int32x4_t)__a, (int16x8_t)__b); } -FUNK uint64x2_t vsubw_high_u32(uint64x2_t __a, uint32x4_t __b) { +__funline uint64x2_t vsubw_high_u32(uint64x2_t __a, uint32x4_t __b) { return (uint64x2_t)__builtin_aarch64_usubw2v4si((int64x2_t)__a, (int32x4_t)__b); } -FUNK int8x8_t vqadd_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vqadd_s8(int8x8_t __a, int8x8_t __b) { return (int8x8_t)__builtin_aarch64_sqaddv8qi(__a, __b); } -FUNK int16x4_t vqadd_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vqadd_s16(int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_aarch64_sqaddv4hi(__a, __b); } -FUNK int32x2_t vqadd_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vqadd_s32(int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_aarch64_sqaddv2si(__a, __b); } -FUNK int64x1_t vqadd_s64(int64x1_t __a, int64x1_t __b) { +__funline int64x1_t vqadd_s64(int64x1_t __a, int64x1_t __b) { return (int64x1_t){__builtin_aarch64_sqadddi(__a[0], __b[0])}; } -FUNK uint8x8_t vqadd_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vqadd_u8(uint8x8_t __a, uint8x8_t __b) { return __builtin_aarch64_uqaddv8qi_uuu(__a, __b); } -FUNK int8x8_t vhsub_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vhsub_s8(int8x8_t __a, int8x8_t __b) { return (int8x8_t)__builtin_aarch64_shsubv8qi(__a, __b); } -FUNK int16x4_t vhsub_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vhsub_s16(int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_aarch64_shsubv4hi(__a, __b); } -FUNK int32x2_t vhsub_s32(int32x2_t __a, 
int32x2_t __b) { +__funline int32x2_t vhsub_s32(int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_aarch64_shsubv2si(__a, __b); } -FUNK uint8x8_t vhsub_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vhsub_u8(uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_aarch64_uhsubv8qi((int8x8_t)__a, (int8x8_t)__b); } -FUNK uint16x4_t vhsub_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vhsub_u16(uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_aarch64_uhsubv4hi((int16x4_t)__a, (int16x4_t)__b); } -FUNK uint32x2_t vhsub_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vhsub_u32(uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_aarch64_uhsubv2si((int32x2_t)__a, (int32x2_t)__b); } -FUNK int8x16_t vhsubq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vhsubq_s8(int8x16_t __a, int8x16_t __b) { return (int8x16_t)__builtin_aarch64_shsubv16qi(__a, __b); } -FUNK int16x8_t vhsubq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vhsubq_s16(int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_aarch64_shsubv8hi(__a, __b); } -FUNK int32x4_t vhsubq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vhsubq_s32(int32x4_t __a, int32x4_t __b) { return (int32x4_t)__builtin_aarch64_shsubv4si(__a, __b); } -FUNK uint8x16_t vhsubq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vhsubq_u8(uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t)__builtin_aarch64_uhsubv16qi((int8x16_t)__a, (int8x16_t)__b); } -FUNK uint16x8_t vhsubq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vhsubq_u16(uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t)__builtin_aarch64_uhsubv8hi((int16x8_t)__a, (int16x8_t)__b); } -FUNK uint32x4_t vhsubq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vhsubq_u32(uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t)__builtin_aarch64_uhsubv4si((int32x4_t)__a, (int32x4_t)__b); } -FUNK int8x8_t vsubhn_s16(int16x8_t __a, int16x8_t __b) { +__funline int8x8_t vsubhn_s16(int16x8_t __a, int16x8_t __b) { return (int8x8_t)__builtin_aarch64_subhnv8hi(__a, __b); } -FUNK int16x4_t vsubhn_s32(int32x4_t __a, int32x4_t __b) { +__funline int16x4_t vsubhn_s32(int32x4_t __a, int32x4_t __b) { return (int16x4_t)__builtin_aarch64_subhnv4si(__a, __b); } -FUNK int32x2_t vsubhn_s64(int64x2_t __a, int64x2_t __b) { +__funline int32x2_t vsubhn_s64(int64x2_t __a, int64x2_t __b) { return (int32x2_t)__builtin_aarch64_subhnv2di(__a, __b); } -FUNK uint8x8_t vsubhn_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint8x8_t vsubhn_u16(uint16x8_t __a, uint16x8_t __b) { return (uint8x8_t)__builtin_aarch64_subhnv8hi((int16x8_t)__a, (int16x8_t)__b); } -FUNK uint16x4_t vsubhn_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint16x4_t vsubhn_u32(uint32x4_t __a, uint32x4_t __b) { return (uint16x4_t)__builtin_aarch64_subhnv4si((int32x4_t)__a, (int32x4_t)__b); } -FUNK uint32x2_t vsubhn_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint32x2_t vsubhn_u64(uint64x2_t __a, uint64x2_t __b) { return (uint32x2_t)__builtin_aarch64_subhnv2di((int64x2_t)__a, (int64x2_t)__b); } -FUNK int8x8_t vrsubhn_s16(int16x8_t __a, int16x8_t __b) { +__funline int8x8_t vrsubhn_s16(int16x8_t __a, int16x8_t __b) { return (int8x8_t)__builtin_aarch64_rsubhnv8hi(__a, __b); } -FUNK int16x4_t vrsubhn_s32(int32x4_t __a, int32x4_t __b) { +__funline int16x4_t vrsubhn_s32(int32x4_t __a, int32x4_t __b) { return (int16x4_t)__builtin_aarch64_rsubhnv4si(__a, __b); } -FUNK int32x2_t vrsubhn_s64(int64x2_t __a, int64x2_t __b) { +__funline int32x2_t 
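/*
 * Sketch of the halving subtracts above (illustrative constants, standard
 * ACLE behavior assumed): vhsub_* computes (a - b) >> 1 on the widened
 * difference, so the shift happens before narrowing and cannot overflow.
 *
 *   int8x8_t a = vdup_n_s8(100);
 *   int8x8_t b = vdup_n_s8(-100);
 *   int8x8_t h = vhsub_s8(a, b);  // (100 - -100) >> 1 == 100 in every lane
 */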
vrsubhn_s64(int64x2_t __a, int64x2_t __b) { return (int32x2_t)__builtin_aarch64_rsubhnv2di(__a, __b); } -FUNK uint8x8_t vrsubhn_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint8x8_t vrsubhn_u16(uint16x8_t __a, uint16x8_t __b) { return (uint8x8_t)__builtin_aarch64_rsubhnv8hi((int16x8_t)__a, (int16x8_t)__b); } -FUNK uint16x4_t vrsubhn_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint16x4_t vrsubhn_u32(uint32x4_t __a, uint32x4_t __b) { return (uint16x4_t)__builtin_aarch64_rsubhnv4si((int32x4_t)__a, (int32x4_t)__b); } -FUNK uint32x2_t vrsubhn_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint32x2_t vrsubhn_u64(uint64x2_t __a, uint64x2_t __b) { return (uint32x2_t)__builtin_aarch64_rsubhnv2di((int64x2_t)__a, (int64x2_t)__b); } -FUNK int8x16_t vrsubhn_high_s16(int8x8_t __a, int16x8_t __b, int16x8_t __c) { +__funline int8x16_t vrsubhn_high_s16(int8x8_t __a, int16x8_t __b, + int16x8_t __c) { return (int8x16_t)__builtin_aarch64_rsubhn2v8hi(__a, __b, __c); } -FUNK int16x8_t vrsubhn_high_s32(int16x4_t __a, int32x4_t __b, int32x4_t __c) { +__funline int16x8_t vrsubhn_high_s32(int16x4_t __a, int32x4_t __b, + int32x4_t __c) { return (int16x8_t)__builtin_aarch64_rsubhn2v4si(__a, __b, __c); } -FUNK int32x4_t vrsubhn_high_s64(int32x2_t __a, int64x2_t __b, int64x2_t __c) { +__funline int32x4_t vrsubhn_high_s64(int32x2_t __a, int64x2_t __b, + int64x2_t __c) { return (int32x4_t)__builtin_aarch64_rsubhn2v2di(__a, __b, __c); } -FUNK uint8x16_t vrsubhn_high_u16(uint8x8_t __a, uint16x8_t __b, - uint16x8_t __c) { +__funline uint8x16_t vrsubhn_high_u16(uint8x8_t __a, uint16x8_t __b, + uint16x8_t __c) { return (uint8x16_t)__builtin_aarch64_rsubhn2v8hi( (int8x8_t)__a, (int16x8_t)__b, (int16x8_t)__c); } -FUNK uint16x8_t vrsubhn_high_u32(uint16x4_t __a, uint32x4_t __b, - uint32x4_t __c) { +__funline uint16x8_t vrsubhn_high_u32(uint16x4_t __a, uint32x4_t __b, + uint32x4_t __c) { return (uint16x8_t)__builtin_aarch64_rsubhn2v4si( (int16x4_t)__a, (int32x4_t)__b, (int32x4_t)__c); } -FUNK uint32x4_t vrsubhn_high_u64(uint32x2_t __a, uint64x2_t __b, - uint64x2_t __c) { +__funline uint32x4_t vrsubhn_high_u64(uint32x2_t __a, uint64x2_t __b, + uint64x2_t __c) { return (uint32x4_t)__builtin_aarch64_rsubhn2v2di( (int32x2_t)__a, (int64x2_t)__b, (int64x2_t)__c); } -FUNK int8x16_t vsubhn_high_s16(int8x8_t __a, int16x8_t __b, int16x8_t __c) { +__funline int8x16_t vsubhn_high_s16(int8x8_t __a, int16x8_t __b, + int16x8_t __c) { return (int8x16_t)__builtin_aarch64_subhn2v8hi(__a, __b, __c); } -FUNK int16x8_t vsubhn_high_s32(int16x4_t __a, int32x4_t __b, int32x4_t __c) { +__funline int16x8_t vsubhn_high_s32(int16x4_t __a, int32x4_t __b, + int32x4_t __c) { return (int16x8_t)__builtin_aarch64_subhn2v4si(__a, __b, __c); } -FUNK int32x4_t vsubhn_high_s64(int32x2_t __a, int64x2_t __b, int64x2_t __c) { +__funline int32x4_t vsubhn_high_s64(int32x2_t __a, int64x2_t __b, + int64x2_t __c) { return (int32x4_t)__builtin_aarch64_subhn2v2di(__a, __b, __c); } -FUNK uint8x16_t vsubhn_high_u16(uint8x8_t __a, uint16x8_t __b, uint16x8_t __c) { +__funline uint8x16_t vsubhn_high_u16(uint8x8_t __a, uint16x8_t __b, + uint16x8_t __c) { return (uint8x16_t)__builtin_aarch64_subhn2v8hi((int8x8_t)__a, (int16x8_t)__b, (int16x8_t)__c); } -FUNK uint16x8_t vsubhn_high_u32(uint16x4_t __a, uint32x4_t __b, - uint32x4_t __c) { +__funline uint16x8_t vsubhn_high_u32(uint16x4_t __a, uint32x4_t __b, + uint32x4_t __c) { return (uint16x8_t)__builtin_aarch64_subhn2v4si( (int16x4_t)__a, (int32x4_t)__b, (int32x4_t)__c); } -FUNK uint32x4_t vsubhn_high_u64(uint32x2_t __a,
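/*
 * The narrowing subtracts above keep only the high half of each wide
 * difference; the r-prefixed forms round that half instead of truncating.
 * A sketch with made-up lane values, assuming standard ACLE semantics:
 *
 *   int16x8_t a = vdupq_n_s16(0x1280);
 *   int16x8_t b = vdupq_n_s16(0);
 *   int8x8_t t = vsubhn_s16(a, b);   // 0x1280 >> 8 == 0x12 (truncated)
 *   int8x8_t r = vrsubhn_s16(a, b);  // (0x1280 + 0x80) >> 8 == 0x13
 *
 * The _high variants pack the narrowed result into the upper half of a
 * 128-bit vector whose lower half is passed as the first argument.
 */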
uint64x2_t __b, - uint64x2_t __c) { +__funline uint32x4_t vsubhn_high_u64(uint32x2_t __a, uint64x2_t __b, + uint64x2_t __c) { return (uint32x4_t)__builtin_aarch64_subhn2v2di( (int32x2_t)__a, (int64x2_t)__b, (int64x2_t)__c); } -FUNK uint16x4_t vqadd_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vqadd_u16(uint16x4_t __a, uint16x4_t __b) { return __builtin_aarch64_uqaddv4hi_uuu(__a, __b); } -FUNK uint32x2_t vqadd_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vqadd_u32(uint32x2_t __a, uint32x2_t __b) { return __builtin_aarch64_uqaddv2si_uuu(__a, __b); } -FUNK uint64x1_t vqadd_u64(uint64x1_t __a, uint64x1_t __b) { +__funline uint64x1_t vqadd_u64(uint64x1_t __a, uint64x1_t __b) { return (uint64x1_t){__builtin_aarch64_uqadddi_uuu(__a[0], __b[0])}; } -FUNK int8x16_t vqaddq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vqaddq_s8(int8x16_t __a, int8x16_t __b) { return (int8x16_t)__builtin_aarch64_sqaddv16qi(__a, __b); } -FUNK int16x8_t vqaddq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vqaddq_s16(int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_aarch64_sqaddv8hi(__a, __b); } -FUNK int32x4_t vqaddq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vqaddq_s32(int32x4_t __a, int32x4_t __b) { return (int32x4_t)__builtin_aarch64_sqaddv4si(__a, __b); } -FUNK int64x2_t vqaddq_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t vqaddq_s64(int64x2_t __a, int64x2_t __b) { return (int64x2_t)__builtin_aarch64_sqaddv2di(__a, __b); } -FUNK uint8x16_t vqaddq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vqaddq_u8(uint8x16_t __a, uint8x16_t __b) { return __builtin_aarch64_uqaddv16qi_uuu(__a, __b); } -FUNK uint16x8_t vqaddq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vqaddq_u16(uint16x8_t __a, uint16x8_t __b) { return __builtin_aarch64_uqaddv8hi_uuu(__a, __b); } -FUNK uint32x4_t vqaddq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vqaddq_u32(uint32x4_t __a, uint32x4_t __b) { return __builtin_aarch64_uqaddv4si_uuu(__a, __b); } -FUNK uint64x2_t vqaddq_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vqaddq_u64(uint64x2_t __a, uint64x2_t __b) { return __builtin_aarch64_uqaddv2di_uuu(__a, __b); } -FUNK int8x8_t vqsub_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vqsub_s8(int8x8_t __a, int8x8_t __b) { return (int8x8_t)__builtin_aarch64_sqsubv8qi(__a, __b); } -FUNK int16x4_t vqsub_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vqsub_s16(int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_aarch64_sqsubv4hi(__a, __b); } -FUNK int32x2_t vqsub_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vqsub_s32(int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_aarch64_sqsubv2si(__a, __b); } -FUNK int64x1_t vqsub_s64(int64x1_t __a, int64x1_t __b) { +__funline int64x1_t vqsub_s64(int64x1_t __a, int64x1_t __b) { return (int64x1_t){__builtin_aarch64_sqsubdi(__a[0], __b[0])}; } -FUNK uint8x8_t vqsub_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vqsub_u8(uint8x8_t __a, uint8x8_t __b) { return __builtin_aarch64_uqsubv8qi_uuu(__a, __b); } -FUNK uint16x4_t vqsub_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vqsub_u16(uint16x4_t __a, uint16x4_t __b) { return __builtin_aarch64_uqsubv4hi_uuu(__a, __b); } -FUNK uint32x2_t vqsub_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vqsub_u32(uint32x2_t __a, uint32x2_t __b) { return __builtin_aarch64_uqsubv2si_uuu(__a, __b); } -FUNK uint64x1_t vqsub_u64(uint64x1_t __a, uint64x1_t __b) { +__funline 
uint64x1_t vqsub_u64(uint64x1_t __a, uint64x1_t __b) { return (uint64x1_t){__builtin_aarch64_uqsubdi_uuu(__a[0], __b[0])}; } -FUNK int8x16_t vqsubq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vqsubq_s8(int8x16_t __a, int8x16_t __b) { return (int8x16_t)__builtin_aarch64_sqsubv16qi(__a, __b); } -FUNK int16x8_t vqsubq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vqsubq_s16(int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_aarch64_sqsubv8hi(__a, __b); } -FUNK int32x4_t vqsubq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vqsubq_s32(int32x4_t __a, int32x4_t __b) { return (int32x4_t)__builtin_aarch64_sqsubv4si(__a, __b); } -FUNK int64x2_t vqsubq_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t vqsubq_s64(int64x2_t __a, int64x2_t __b) { return (int64x2_t)__builtin_aarch64_sqsubv2di(__a, __b); } -FUNK uint8x16_t vqsubq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vqsubq_u8(uint8x16_t __a, uint8x16_t __b) { return __builtin_aarch64_uqsubv16qi_uuu(__a, __b); } -FUNK uint16x8_t vqsubq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vqsubq_u16(uint16x8_t __a, uint16x8_t __b) { return __builtin_aarch64_uqsubv8hi_uuu(__a, __b); } -FUNK uint32x4_t vqsubq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vqsubq_u32(uint32x4_t __a, uint32x4_t __b) { return __builtin_aarch64_uqsubv4si_uuu(__a, __b); } -FUNK uint64x2_t vqsubq_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vqsubq_u64(uint64x2_t __a, uint64x2_t __b) { return __builtin_aarch64_uqsubv2di_uuu(__a, __b); } -FUNK int8x8_t vqneg_s8(int8x8_t __a) { +__funline int8x8_t vqneg_s8(int8x8_t __a) { return (int8x8_t)__builtin_aarch64_sqnegv8qi(__a); } -FUNK int16x4_t vqneg_s16(int16x4_t __a) { +__funline int16x4_t vqneg_s16(int16x4_t __a) { return (int16x4_t)__builtin_aarch64_sqnegv4hi(__a); } -FUNK int32x2_t vqneg_s32(int32x2_t __a) { +__funline int32x2_t vqneg_s32(int32x2_t __a) { return (int32x2_t)__builtin_aarch64_sqnegv2si(__a); } -FUNK int64x1_t vqneg_s64(int64x1_t __a) { +__funline int64x1_t vqneg_s64(int64x1_t __a) { return (int64x1_t){__builtin_aarch64_sqnegdi(__a[0])}; } -FUNK int8x16_t vqnegq_s8(int8x16_t __a) { +__funline int8x16_t vqnegq_s8(int8x16_t __a) { return (int8x16_t)__builtin_aarch64_sqnegv16qi(__a); } -FUNK int16x8_t vqnegq_s16(int16x8_t __a) { +__funline int16x8_t vqnegq_s16(int16x8_t __a) { return (int16x8_t)__builtin_aarch64_sqnegv8hi(__a); } -FUNK int32x4_t vqnegq_s32(int32x4_t __a) { +__funline int32x4_t vqnegq_s32(int32x4_t __a) { return (int32x4_t)__builtin_aarch64_sqnegv4si(__a); } -FUNK int8x8_t vqabs_s8(int8x8_t __a) { +__funline int8x8_t vqabs_s8(int8x8_t __a) { return (int8x8_t)__builtin_aarch64_sqabsv8qi(__a); } -FUNK int16x4_t vqabs_s16(int16x4_t __a) { +__funline int16x4_t vqabs_s16(int16x4_t __a) { return (int16x4_t)__builtin_aarch64_sqabsv4hi(__a); } -FUNK int32x2_t vqabs_s32(int32x2_t __a) { +__funline int32x2_t vqabs_s32(int32x2_t __a) { return (int32x2_t)__builtin_aarch64_sqabsv2si(__a); } -FUNK int64x1_t vqabs_s64(int64x1_t __a) { +__funline int64x1_t vqabs_s64(int64x1_t __a) { return (int64x1_t){__builtin_aarch64_sqabsdi(__a[0])}; } -FUNK int8x16_t vqabsq_s8(int8x16_t __a) { +__funline int8x16_t vqabsq_s8(int8x16_t __a) { return (int8x16_t)__builtin_aarch64_sqabsv16qi(__a); } -FUNK int16x8_t vqabsq_s16(int16x8_t __a) { +__funline int16x8_t vqabsq_s16(int16x8_t __a) { return (int16x8_t)__builtin_aarch64_sqabsv8hi(__a); } -FUNK int32x4_t vqabsq_s32(int32x4_t __a) { +__funline int32x4_t vqabsq_s32(int32x4_t __a) { 
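/*
 * Everything q-prefixed above saturates instead of wrapping (sketch with
 * illustrative constants under standard ACLE behavior):
 *
 *   int8x8_t a = vdup_n_s8(112);
 *   int8x8_t s = vqadd_s8(a, a);       // 112 + 112 saturates to 127
 *   int8x8_t m = vdup_n_s8(-128);
 *   int8x8_t n = vqneg_s8(m);          // -(-128) saturates to 127
 *
 * A plain vadd_s8 would wrap 224 around to -32, which is why the
 * saturating forms exist for DSP-style code.
 */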
return (int32x4_t)__builtin_aarch64_sqabsv4si(__a); } -FUNK int16x4_t vqdmulh_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vqdmulh_s16(int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_aarch64_sqdmulhv4hi(__a, __b); } -FUNK int32x2_t vqdmulh_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vqdmulh_s32(int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_aarch64_sqdmulhv2si(__a, __b); } -FUNK int16x8_t vqdmulhq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vqdmulhq_s16(int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_aarch64_sqdmulhv8hi(__a, __b); } -FUNK int32x4_t vqdmulhq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vqdmulhq_s32(int32x4_t __a, int32x4_t __b) { return (int32x4_t)__builtin_aarch64_sqdmulhv4si(__a, __b); } -FUNK int16x4_t vqrdmulh_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vqrdmulh_s16(int16x4_t __a, int16x4_t __b) { return (int16x4_t)__builtin_aarch64_sqrdmulhv4hi(__a, __b); } -FUNK int32x2_t vqrdmulh_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vqrdmulh_s32(int32x2_t __a, int32x2_t __b) { return (int32x2_t)__builtin_aarch64_sqrdmulhv2si(__a, __b); } -FUNK int16x8_t vqrdmulhq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vqrdmulhq_s16(int16x8_t __a, int16x8_t __b) { return (int16x8_t)__builtin_aarch64_sqrdmulhv8hi(__a, __b); } -FUNK int32x4_t vqrdmulhq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vqrdmulhq_s32(int32x4_t __a, int32x4_t __b) { return (int32x4_t)__builtin_aarch64_sqrdmulhv4si(__a, __b); } -FUNK int8x8_t vcreate_s8(uint64_t __a) { +__funline int8x8_t vcreate_s8(uint64_t __a) { return (int8x8_t)__a; } -FUNK int16x4_t vcreate_s16(uint64_t __a) { +__funline int16x4_t vcreate_s16(uint64_t __a) { return (int16x4_t)__a; } -FUNK int32x2_t vcreate_s32(uint64_t __a) { +__funline int32x2_t vcreate_s32(uint64_t __a) { return (int32x2_t)__a; } -FUNK int64x1_t vcreate_s64(uint64_t __a) { +__funline int64x1_t vcreate_s64(uint64_t __a) { return (int64x1_t){__a}; } -FUNK float16x4_t vcreate_f16(uint64_t __a) { +__funline float16x4_t vcreate_f16(uint64_t __a) { return (float16x4_t)__a; } -FUNK float32x2_t vcreate_f32(uint64_t __a) { +__funline float32x2_t vcreate_f32(uint64_t __a) { return (float32x2_t)__a; } -FUNK uint8x8_t vcreate_u8(uint64_t __a) { +__funline uint8x8_t vcreate_u8(uint64_t __a) { return (uint8x8_t)__a; } -FUNK uint16x4_t vcreate_u16(uint64_t __a) { +__funline uint16x4_t vcreate_u16(uint64_t __a) { return (uint16x4_t)__a; } -FUNK uint32x2_t vcreate_u32(uint64_t __a) { +__funline uint32x2_t vcreate_u32(uint64_t __a) { return (uint32x2_t)__a; } -FUNK uint64x1_t vcreate_u64(uint64_t __a) { +__funline uint64x1_t vcreate_u64(uint64_t __a) { return (uint64x1_t){__a}; } -FUNK float64x1_t vcreate_f64(uint64_t __a) { +__funline float64x1_t vcreate_f64(uint64_t __a) { return (float64x1_t)__a; } -FUNK poly8x8_t vcreate_p8(uint64_t __a) { +__funline poly8x8_t vcreate_p8(uint64_t __a) { return (poly8x8_t)__a; } -FUNK poly16x4_t vcreate_p16(uint64_t __a) { +__funline poly16x4_t vcreate_p16(uint64_t __a) { return (poly16x4_t)__a; } -FUNK poly64x1_t vcreate_p64(uint64_t __a) { +__funline poly64x1_t vcreate_p64(uint64_t __a) { return (poly64x1_t)__a; } -FUNK float16_t vget_lane_f16(float16x4_t __a, const int __b) { +__funline float16_t vget_lane_f16(float16x4_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK float32_t vget_lane_f32(float32x2_t __a, const int __b) { +__funline float32_t vget_lane_f32(float32x2_t __a, const int __b) { 
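/*
 * vqdmulh and vqrdmulh are Q15/Q31 fixed-point multiplies: the product is
 * doubled and the high half kept, with the r form rounding it. A sketch
 * with assumed Q15 inputs:
 *
 *   int16x4_t a = vdup_n_s16(0x4000);   // 0.5 in Q15
 *   int16x4_t b = vdup_n_s16(0x2000);   // 0.25 in Q15
 *   int16x4_t p = vqdmulh_s16(a, b);    // 0x1000, i.e. 0.125 in Q15
 *
 * The one overflow case, INT16_MIN * INT16_MIN, saturates to INT16_MAX
 * rather than wrapping.
 */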
return __aarch64_vget_lane_any(__a, __b); } -FUNK float64_t vget_lane_f64(float64x1_t __a, const int __b) { +__funline float64_t vget_lane_f64(float64x1_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK poly8_t vget_lane_p8(poly8x8_t __a, const int __b) { +__funline poly8_t vget_lane_p8(poly8x8_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK poly16_t vget_lane_p16(poly16x4_t __a, const int __b) { +__funline poly16_t vget_lane_p16(poly16x4_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK poly64_t vget_lane_p64(poly64x1_t __a, const int __b) { +__funline poly64_t vget_lane_p64(poly64x1_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK int8_t vget_lane_s8(int8x8_t __a, const int __b) { +__funline int8_t vget_lane_s8(int8x8_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK int16_t vget_lane_s16(int16x4_t __a, const int __b) { +__funline int16_t vget_lane_s16(int16x4_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK int32_t vget_lane_s32(int32x2_t __a, const int __b) { +__funline int32_t vget_lane_s32(int32x2_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK int64_t vget_lane_s64(int64x1_t __a, const int __b) { +__funline int64_t vget_lane_s64(int64x1_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK uint8_t vget_lane_u8(uint8x8_t __a, const int __b) { +__funline uint8_t vget_lane_u8(uint8x8_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK uint16_t vget_lane_u16(uint16x4_t __a, const int __b) { +__funline uint16_t vget_lane_u16(uint16x4_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK uint32_t vget_lane_u32(uint32x2_t __a, const int __b) { +__funline uint32_t vget_lane_u32(uint32x2_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK uint64_t vget_lane_u64(uint64x1_t __a, const int __b) { +__funline uint64_t vget_lane_u64(uint64x1_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK float16_t vgetq_lane_f16(float16x8_t __a, const int __b) { +__funline float16_t vgetq_lane_f16(float16x8_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK float32_t vgetq_lane_f32(float32x4_t __a, const int __b) { +__funline float32_t vgetq_lane_f32(float32x4_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK float64_t vgetq_lane_f64(float64x2_t __a, const int __b) { +__funline float64_t vgetq_lane_f64(float64x2_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK poly8_t vgetq_lane_p8(poly8x16_t __a, const int __b) { +__funline poly8_t vgetq_lane_p8(poly8x16_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK poly16_t vgetq_lane_p16(poly16x8_t __a, const int __b) { +__funline poly16_t vgetq_lane_p16(poly16x8_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK poly64_t vgetq_lane_p64(poly64x2_t __a, const int __b) { +__funline poly64_t vgetq_lane_p64(poly64x2_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK int8_t vgetq_lane_s8(int8x16_t __a, const int __b) { +__funline int8_t vgetq_lane_s8(int8x16_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK int16_t vgetq_lane_s16(int16x8_t __a, const int __b) { +__funline int16_t vgetq_lane_s16(int16x8_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK int32_t vgetq_lane_s32(int32x4_t __a, const int __b) { 
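/*
 * vcreate_* builds a 64-bit vector directly from a scalar bit pattern
 * (lane 0 comes from the least significant bits), and the vget_lane and
 * vgetq_lane families read a single lane back out; the lane index must be
 * a compile-time constant. An illustrative sketch:
 *
 *   uint8x8_t v = vcreate_u8(0x0706050403020100ULL);
 *   uint8_t x = vget_lane_u8(v, 3);  // == 0x03
 */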
+__funline int32_t vgetq_lane_s32(int32x4_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK int64_t vgetq_lane_s64(int64x2_t __a, const int __b) { +__funline int64_t vgetq_lane_s64(int64x2_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK uint8_t vgetq_lane_u8(uint8x16_t __a, const int __b) { +__funline uint8_t vgetq_lane_u8(uint8x16_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK uint16_t vgetq_lane_u16(uint16x8_t __a, const int __b) { +__funline uint16_t vgetq_lane_u16(uint16x8_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK uint32_t vgetq_lane_u32(uint32x4_t __a, const int __b) { +__funline uint32_t vgetq_lane_u32(uint32x4_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK uint64_t vgetq_lane_u64(uint64x2_t __a, const int __b) { +__funline uint64_t vgetq_lane_u64(uint64x2_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK poly8x8_t vreinterpret_p8_f16(float16x4_t __a) { +__funline poly8x8_t vreinterpret_p8_f16(float16x4_t __a) { return (poly8x8_t)__a; } -FUNK poly8x8_t vreinterpret_p8_f64(float64x1_t __a) { +__funline poly8x8_t vreinterpret_p8_f64(float64x1_t __a) { return (poly8x8_t)__a; } -FUNK poly8x8_t vreinterpret_p8_s8(int8x8_t __a) { +__funline poly8x8_t vreinterpret_p8_s8(int8x8_t __a) { return (poly8x8_t)__a; } -FUNK poly8x8_t vreinterpret_p8_s16(int16x4_t __a) { +__funline poly8x8_t vreinterpret_p8_s16(int16x4_t __a) { return (poly8x8_t)__a; } -FUNK poly8x8_t vreinterpret_p8_s32(int32x2_t __a) { +__funline poly8x8_t vreinterpret_p8_s32(int32x2_t __a) { return (poly8x8_t)__a; } -FUNK poly8x8_t vreinterpret_p8_s64(int64x1_t __a) { +__funline poly8x8_t vreinterpret_p8_s64(int64x1_t __a) { return (poly8x8_t)__a; } -FUNK poly8x8_t vreinterpret_p8_f32(float32x2_t __a) { +__funline poly8x8_t vreinterpret_p8_f32(float32x2_t __a) { return (poly8x8_t)__a; } -FUNK poly8x8_t vreinterpret_p8_u8(uint8x8_t __a) { +__funline poly8x8_t vreinterpret_p8_u8(uint8x8_t __a) { return (poly8x8_t)__a; } -FUNK poly8x8_t vreinterpret_p8_u16(uint16x4_t __a) { +__funline poly8x8_t vreinterpret_p8_u16(uint16x4_t __a) { return (poly8x8_t)__a; } -FUNK poly8x8_t vreinterpret_p8_u32(uint32x2_t __a) { +__funline poly8x8_t vreinterpret_p8_u32(uint32x2_t __a) { return (poly8x8_t)__a; } -FUNK poly8x8_t vreinterpret_p8_u64(uint64x1_t __a) { +__funline poly8x8_t vreinterpret_p8_u64(uint64x1_t __a) { return (poly8x8_t)__a; } -FUNK poly8x8_t vreinterpret_p8_p16(poly16x4_t __a) { +__funline poly8x8_t vreinterpret_p8_p16(poly16x4_t __a) { return (poly8x8_t)__a; } -FUNK poly8x8_t vreinterpret_p8_p64(poly64x1_t __a) { +__funline poly8x8_t vreinterpret_p8_p64(poly64x1_t __a) { return (poly8x8_t)__a; } -FUNK poly8x16_t vreinterpretq_p8_f64(float64x2_t __a) { +__funline poly8x16_t vreinterpretq_p8_f64(float64x2_t __a) { return (poly8x16_t)__a; } -FUNK poly8x16_t vreinterpretq_p8_s8(int8x16_t __a) { +__funline poly8x16_t vreinterpretq_p8_s8(int8x16_t __a) { return (poly8x16_t)__a; } -FUNK poly8x16_t vreinterpretq_p8_s16(int16x8_t __a) { +__funline poly8x16_t vreinterpretq_p8_s16(int16x8_t __a) { return (poly8x16_t)__a; } -FUNK poly8x16_t vreinterpretq_p8_s32(int32x4_t __a) { +__funline poly8x16_t vreinterpretq_p8_s32(int32x4_t __a) { return (poly8x16_t)__a; } -FUNK poly8x16_t vreinterpretq_p8_s64(int64x2_t __a) { +__funline poly8x16_t vreinterpretq_p8_s64(int64x2_t __a) { return (poly8x16_t)__a; } -FUNK poly8x16_t vreinterpretq_p8_f16(float16x8_t __a) { +__funline poly8x16_t 
vreinterpretq_p8_f16(float16x8_t __a) { return (poly8x16_t)__a; } -FUNK poly8x16_t vreinterpretq_p8_f32(float32x4_t __a) { +__funline poly8x16_t vreinterpretq_p8_f32(float32x4_t __a) { return (poly8x16_t)__a; } -FUNK poly8x16_t vreinterpretq_p8_u8(uint8x16_t __a) { +__funline poly8x16_t vreinterpretq_p8_u8(uint8x16_t __a) { return (poly8x16_t)__a; } -FUNK poly8x16_t vreinterpretq_p8_u16(uint16x8_t __a) { +__funline poly8x16_t vreinterpretq_p8_u16(uint16x8_t __a) { return (poly8x16_t)__a; } -FUNK poly8x16_t vreinterpretq_p8_u32(uint32x4_t __a) { +__funline poly8x16_t vreinterpretq_p8_u32(uint32x4_t __a) { return (poly8x16_t)__a; } -FUNK poly8x16_t vreinterpretq_p8_u64(uint64x2_t __a) { +__funline poly8x16_t vreinterpretq_p8_u64(uint64x2_t __a) { return (poly8x16_t)__a; } -FUNK poly8x16_t vreinterpretq_p8_p16(poly16x8_t __a) { +__funline poly8x16_t vreinterpretq_p8_p16(poly16x8_t __a) { return (poly8x16_t)__a; } -FUNK poly8x16_t vreinterpretq_p8_p64(poly64x2_t __a) { +__funline poly8x16_t vreinterpretq_p8_p64(poly64x2_t __a) { return (poly8x16_t)__a; } -FUNK poly8x16_t vreinterpretq_p8_p128(poly128_t __a) { +__funline poly8x16_t vreinterpretq_p8_p128(poly128_t __a) { return (poly8x16_t)__a; } -FUNK poly16x4_t vreinterpret_p16_f16(float16x4_t __a) { +__funline poly16x4_t vreinterpret_p16_f16(float16x4_t __a) { return (poly16x4_t)__a; } -FUNK poly16x4_t vreinterpret_p16_f64(float64x1_t __a) { +__funline poly16x4_t vreinterpret_p16_f64(float64x1_t __a) { return (poly16x4_t)__a; } -FUNK poly16x4_t vreinterpret_p16_s8(int8x8_t __a) { +__funline poly16x4_t vreinterpret_p16_s8(int8x8_t __a) { return (poly16x4_t)__a; } -FUNK poly16x4_t vreinterpret_p16_s16(int16x4_t __a) { +__funline poly16x4_t vreinterpret_p16_s16(int16x4_t __a) { return (poly16x4_t)__a; } -FUNK poly16x4_t vreinterpret_p16_s32(int32x2_t __a) { +__funline poly16x4_t vreinterpret_p16_s32(int32x2_t __a) { return (poly16x4_t)__a; } -FUNK poly16x4_t vreinterpret_p16_s64(int64x1_t __a) { +__funline poly16x4_t vreinterpret_p16_s64(int64x1_t __a) { return (poly16x4_t)__a; } -FUNK poly16x4_t vreinterpret_p16_f32(float32x2_t __a) { +__funline poly16x4_t vreinterpret_p16_f32(float32x2_t __a) { return (poly16x4_t)__a; } -FUNK poly16x4_t vreinterpret_p16_u8(uint8x8_t __a) { +__funline poly16x4_t vreinterpret_p16_u8(uint8x8_t __a) { return (poly16x4_t)__a; } -FUNK poly16x4_t vreinterpret_p16_u16(uint16x4_t __a) { +__funline poly16x4_t vreinterpret_p16_u16(uint16x4_t __a) { return (poly16x4_t)__a; } -FUNK poly16x4_t vreinterpret_p16_u32(uint32x2_t __a) { +__funline poly16x4_t vreinterpret_p16_u32(uint32x2_t __a) { return (poly16x4_t)__a; } -FUNK poly16x4_t vreinterpret_p16_u64(uint64x1_t __a) { +__funline poly16x4_t vreinterpret_p16_u64(uint64x1_t __a) { return (poly16x4_t)__a; } -FUNK poly16x4_t vreinterpret_p16_p8(poly8x8_t __a) { +__funline poly16x4_t vreinterpret_p16_p8(poly8x8_t __a) { return (poly16x4_t)__a; } -FUNK poly16x4_t vreinterpret_p16_p64(poly64x1_t __a) { +__funline poly16x4_t vreinterpret_p16_p64(poly64x1_t __a) { return (poly16x4_t)__a; } -FUNK poly16x8_t vreinterpretq_p16_f64(float64x2_t __a) { +__funline poly16x8_t vreinterpretq_p16_f64(float64x2_t __a) { return (poly16x8_t)__a; } -FUNK poly16x8_t vreinterpretq_p16_s8(int8x16_t __a) { +__funline poly16x8_t vreinterpretq_p16_s8(int8x16_t __a) { return (poly16x8_t)__a; } -FUNK poly16x8_t vreinterpretq_p16_s16(int16x8_t __a) { +__funline poly16x8_t vreinterpretq_p16_s16(int16x8_t __a) { return (poly16x8_t)__a; } -FUNK poly16x8_t vreinterpretq_p16_s32(int32x4_t __a) { +__funline 
poly16x8_t vreinterpretq_p16_s32(int32x4_t __a) { return (poly16x8_t)__a; } -FUNK poly16x8_t vreinterpretq_p16_s64(int64x2_t __a) { +__funline poly16x8_t vreinterpretq_p16_s64(int64x2_t __a) { return (poly16x8_t)__a; } -FUNK poly16x8_t vreinterpretq_p16_f16(float16x8_t __a) { +__funline poly16x8_t vreinterpretq_p16_f16(float16x8_t __a) { return (poly16x8_t)__a; } -FUNK poly16x8_t vreinterpretq_p16_f32(float32x4_t __a) { +__funline poly16x8_t vreinterpretq_p16_f32(float32x4_t __a) { return (poly16x8_t)__a; } -FUNK poly16x8_t vreinterpretq_p16_u8(uint8x16_t __a) { +__funline poly16x8_t vreinterpretq_p16_u8(uint8x16_t __a) { return (poly16x8_t)__a; } -FUNK poly16x8_t vreinterpretq_p16_u16(uint16x8_t __a) { +__funline poly16x8_t vreinterpretq_p16_u16(uint16x8_t __a) { return (poly16x8_t)__a; } -FUNK poly16x8_t vreinterpretq_p16_u32(uint32x4_t __a) { +__funline poly16x8_t vreinterpretq_p16_u32(uint32x4_t __a) { return (poly16x8_t)__a; } -FUNK poly16x8_t vreinterpretq_p16_u64(uint64x2_t __a) { +__funline poly16x8_t vreinterpretq_p16_u64(uint64x2_t __a) { return (poly16x8_t)__a; } -FUNK poly16x8_t vreinterpretq_p16_p8(poly8x16_t __a) { +__funline poly16x8_t vreinterpretq_p16_p8(poly8x16_t __a) { return (poly16x8_t)__a; } -FUNK poly16x8_t vreinterpretq_p16_p64(poly64x2_t __a) { +__funline poly16x8_t vreinterpretq_p16_p64(poly64x2_t __a) { return (poly16x8_t)__a; } -FUNK poly16x8_t vreinterpretq_p16_p128(poly128_t __a) { +__funline poly16x8_t vreinterpretq_p16_p128(poly128_t __a) { return (poly16x8_t)__a; } -FUNK poly64x1_t vreinterpret_p64_f16(float16x4_t __a) { +__funline poly64x1_t vreinterpret_p64_f16(float16x4_t __a) { return (poly64x1_t)__a; } -FUNK poly64x1_t vreinterpret_p64_f64(float64x1_t __a) { +__funline poly64x1_t vreinterpret_p64_f64(float64x1_t __a) { return (poly64x1_t)__a; } -FUNK poly64x1_t vreinterpret_p64_s8(int8x8_t __a) { +__funline poly64x1_t vreinterpret_p64_s8(int8x8_t __a) { return (poly64x1_t)__a; } -FUNK poly64x1_t vreinterpret_p64_s16(int16x4_t __a) { +__funline poly64x1_t vreinterpret_p64_s16(int16x4_t __a) { return (poly64x1_t)__a; } -FUNK poly64x1_t vreinterpret_p64_s32(int32x2_t __a) { +__funline poly64x1_t vreinterpret_p64_s32(int32x2_t __a) { return (poly64x1_t)__a; } -FUNK poly64x1_t vreinterpret_p64_s64(int64x1_t __a) { +__funline poly64x1_t vreinterpret_p64_s64(int64x1_t __a) { return (poly64x1_t)__a; } -FUNK poly64x1_t vreinterpret_p64_f32(float32x2_t __a) { +__funline poly64x1_t vreinterpret_p64_f32(float32x2_t __a) { return (poly64x1_t)__a; } -FUNK poly64x1_t vreinterpret_p64_u8(uint8x8_t __a) { +__funline poly64x1_t vreinterpret_p64_u8(uint8x8_t __a) { return (poly64x1_t)__a; } -FUNK poly64x1_t vreinterpret_p64_u16(uint16x4_t __a) { +__funline poly64x1_t vreinterpret_p64_u16(uint16x4_t __a) { return (poly64x1_t)__a; } -FUNK poly64x1_t vreinterpret_p64_u32(uint32x2_t __a) { +__funline poly64x1_t vreinterpret_p64_u32(uint32x2_t __a) { return (poly64x1_t)__a; } -FUNK poly64x1_t vreinterpret_p64_u64(uint64x1_t __a) { +__funline poly64x1_t vreinterpret_p64_u64(uint64x1_t __a) { return (poly64x1_t)__a; } -FUNK poly64x1_t vreinterpret_p64_p8(poly8x8_t __a) { +__funline poly64x1_t vreinterpret_p64_p8(poly8x8_t __a) { return (poly64x1_t)__a; } -FUNK poly64x1_t vreinterpret_p64_p16(poly16x4_t __a) { +__funline poly64x1_t vreinterpret_p64_p16(poly16x4_t __a) { return (poly64x1_t)__a; } -FUNK poly64x2_t vreinterpretq_p64_f64(float64x2_t __a) { +__funline poly64x2_t vreinterpretq_p64_f64(float64x2_t __a) { return (poly64x2_t)__a; } -FUNK poly64x2_t 
vreinterpretq_p64_s8(int8x16_t __a) { +__funline poly64x2_t vreinterpretq_p64_s8(int8x16_t __a) { return (poly64x2_t)__a; } -FUNK poly64x2_t vreinterpretq_p64_s16(int16x8_t __a) { +__funline poly64x2_t vreinterpretq_p64_s16(int16x8_t __a) { return (poly64x2_t)__a; } -FUNK poly64x2_t vreinterpretq_p64_s32(int32x4_t __a) { +__funline poly64x2_t vreinterpretq_p64_s32(int32x4_t __a) { return (poly64x2_t)__a; } -FUNK poly64x2_t vreinterpretq_p64_s64(int64x2_t __a) { +__funline poly64x2_t vreinterpretq_p64_s64(int64x2_t __a) { return (poly64x2_t)__a; } -FUNK poly64x2_t vreinterpretq_p64_f16(float16x8_t __a) { +__funline poly64x2_t vreinterpretq_p64_f16(float16x8_t __a) { return (poly64x2_t)__a; } -FUNK poly64x2_t vreinterpretq_p64_f32(float32x4_t __a) { +__funline poly64x2_t vreinterpretq_p64_f32(float32x4_t __a) { return (poly64x2_t)__a; } -FUNK poly64x2_t vreinterpretq_p64_p128(poly128_t __a) { +__funline poly64x2_t vreinterpretq_p64_p128(poly128_t __a) { return (poly64x2_t)__a; } -FUNK poly64x2_t vreinterpretq_p64_u8(uint8x16_t __a) { +__funline poly64x2_t vreinterpretq_p64_u8(uint8x16_t __a) { return (poly64x2_t)__a; } -FUNK poly64x2_t vreinterpretq_p64_u16(uint16x8_t __a) { +__funline poly64x2_t vreinterpretq_p64_u16(uint16x8_t __a) { return (poly64x2_t)__a; } -FUNK poly64x2_t vreinterpretq_p64_p16(poly16x8_t __a) { +__funline poly64x2_t vreinterpretq_p64_p16(poly16x8_t __a) { return (poly64x2_t)__a; } -FUNK poly64x2_t vreinterpretq_p64_u32(uint32x4_t __a) { +__funline poly64x2_t vreinterpretq_p64_u32(uint32x4_t __a) { return (poly64x2_t)__a; } -FUNK poly64x2_t vreinterpretq_p64_u64(uint64x2_t __a) { +__funline poly64x2_t vreinterpretq_p64_u64(uint64x2_t __a) { return (poly64x2_t)__a; } -FUNK poly64x2_t vreinterpretq_p64_p8(poly8x16_t __a) { +__funline poly64x2_t vreinterpretq_p64_p8(poly8x16_t __a) { return (poly64x2_t)__a; } -FUNK poly128_t vreinterpretq_p128_p8(poly8x16_t __a) { +__funline poly128_t vreinterpretq_p128_p8(poly8x16_t __a) { return (poly128_t)__a; } -FUNK poly128_t vreinterpretq_p128_p16(poly16x8_t __a) { +__funline poly128_t vreinterpretq_p128_p16(poly16x8_t __a) { return (poly128_t)__a; } -FUNK poly128_t vreinterpretq_p128_f16(float16x8_t __a) { +__funline poly128_t vreinterpretq_p128_f16(float16x8_t __a) { return (poly128_t)__a; } -FUNK poly128_t vreinterpretq_p128_f32(float32x4_t __a) { +__funline poly128_t vreinterpretq_p128_f32(float32x4_t __a) { return (poly128_t)__a; } -FUNK poly128_t vreinterpretq_p128_p64(poly64x2_t __a) { +__funline poly128_t vreinterpretq_p128_p64(poly64x2_t __a) { return (poly128_t)__a; } -FUNK poly128_t vreinterpretq_p128_s64(int64x2_t __a) { +__funline poly128_t vreinterpretq_p128_s64(int64x2_t __a) { return (poly128_t)__a; } -FUNK poly128_t vreinterpretq_p128_u64(uint64x2_t __a) { +__funline poly128_t vreinterpretq_p128_u64(uint64x2_t __a) { return (poly128_t)__a; } -FUNK poly128_t vreinterpretq_p128_s8(int8x16_t __a) { +__funline poly128_t vreinterpretq_p128_s8(int8x16_t __a) { return (poly128_t)__a; } -FUNK poly128_t vreinterpretq_p128_s16(int16x8_t __a) { +__funline poly128_t vreinterpretq_p128_s16(int16x8_t __a) { return (poly128_t)__a; } -FUNK poly128_t vreinterpretq_p128_s32(int32x4_t __a) { +__funline poly128_t vreinterpretq_p128_s32(int32x4_t __a) { return (poly128_t)__a; } -FUNK poly128_t vreinterpretq_p128_u8(uint8x16_t __a) { +__funline poly128_t vreinterpretq_p128_u8(uint8x16_t __a) { return (poly128_t)__a; } -FUNK poly128_t vreinterpretq_p128_u16(uint16x8_t __a) { +__funline poly128_t vreinterpretq_p128_u16(uint16x8_t __a) { 
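/*
 * Every vreinterpret and vreinterpretq wrapper above is a pure bit cast:
 * the 64 or 128 bits are untouched and only the lane typing changes, so
 * integer lanes are not converted to float values (illustrative sketch):
 *
 *   int32x2_t i = vdup_n_s32(1);
 *   float32x2_t f = vreinterpret_f32_s32(i);  // lanes are the denormal
 *                                             // 1.4e-45, not 1.0f
 *
 * To convert values rather than bits, use vcvt_f32_s32 and friends.
 */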
return (poly128_t)__a; } -FUNK poly128_t vreinterpretq_p128_u32(uint32x4_t __a) { +__funline poly128_t vreinterpretq_p128_u32(uint32x4_t __a) { return (poly128_t)__a; } -FUNK float16x4_t vreinterpret_f16_f64(float64x1_t __a) { +__funline float16x4_t vreinterpret_f16_f64(float64x1_t __a) { return (float16x4_t)__a; } -FUNK float16x4_t vreinterpret_f16_s8(int8x8_t __a) { +__funline float16x4_t vreinterpret_f16_s8(int8x8_t __a) { return (float16x4_t)__a; } -FUNK float16x4_t vreinterpret_f16_s16(int16x4_t __a) { +__funline float16x4_t vreinterpret_f16_s16(int16x4_t __a) { return (float16x4_t)__a; } -FUNK float16x4_t vreinterpret_f16_s32(int32x2_t __a) { +__funline float16x4_t vreinterpret_f16_s32(int32x2_t __a) { return (float16x4_t)__a; } -FUNK float16x4_t vreinterpret_f16_s64(int64x1_t __a) { +__funline float16x4_t vreinterpret_f16_s64(int64x1_t __a) { return (float16x4_t)__a; } -FUNK float16x4_t vreinterpret_f16_f32(float32x2_t __a) { +__funline float16x4_t vreinterpret_f16_f32(float32x2_t __a) { return (float16x4_t)__a; } -FUNK float16x4_t vreinterpret_f16_u8(uint8x8_t __a) { +__funline float16x4_t vreinterpret_f16_u8(uint8x8_t __a) { return (float16x4_t)__a; } -FUNK float16x4_t vreinterpret_f16_u16(uint16x4_t __a) { +__funline float16x4_t vreinterpret_f16_u16(uint16x4_t __a) { return (float16x4_t)__a; } -FUNK float16x4_t vreinterpret_f16_u32(uint32x2_t __a) { +__funline float16x4_t vreinterpret_f16_u32(uint32x2_t __a) { return (float16x4_t)__a; } -FUNK float16x4_t vreinterpret_f16_u64(uint64x1_t __a) { +__funline float16x4_t vreinterpret_f16_u64(uint64x1_t __a) { return (float16x4_t)__a; } -FUNK float16x4_t vreinterpret_f16_p8(poly8x8_t __a) { +__funline float16x4_t vreinterpret_f16_p8(poly8x8_t __a) { return (float16x4_t)__a; } -FUNK float16x4_t vreinterpret_f16_p16(poly16x4_t __a) { +__funline float16x4_t vreinterpret_f16_p16(poly16x4_t __a) { return (float16x4_t)__a; } -FUNK float16x4_t vreinterpret_f16_p64(poly64x1_t __a) { +__funline float16x4_t vreinterpret_f16_p64(poly64x1_t __a) { return (float16x4_t)__a; } -FUNK float16x8_t vreinterpretq_f16_f64(float64x2_t __a) { +__funline float16x8_t vreinterpretq_f16_f64(float64x2_t __a) { return (float16x8_t)__a; } -FUNK float16x8_t vreinterpretq_f16_s8(int8x16_t __a) { +__funline float16x8_t vreinterpretq_f16_s8(int8x16_t __a) { return (float16x8_t)__a; } -FUNK float16x8_t vreinterpretq_f16_s16(int16x8_t __a) { +__funline float16x8_t vreinterpretq_f16_s16(int16x8_t __a) { return (float16x8_t)__a; } -FUNK float16x8_t vreinterpretq_f16_s32(int32x4_t __a) { +__funline float16x8_t vreinterpretq_f16_s32(int32x4_t __a) { return (float16x8_t)__a; } -FUNK float16x8_t vreinterpretq_f16_s64(int64x2_t __a) { +__funline float16x8_t vreinterpretq_f16_s64(int64x2_t __a) { return (float16x8_t)__a; } -FUNK float16x8_t vreinterpretq_f16_f32(float32x4_t __a) { +__funline float16x8_t vreinterpretq_f16_f32(float32x4_t __a) { return (float16x8_t)__a; } -FUNK float16x8_t vreinterpretq_f16_u8(uint8x16_t __a) { +__funline float16x8_t vreinterpretq_f16_u8(uint8x16_t __a) { return (float16x8_t)__a; } -FUNK float16x8_t vreinterpretq_f16_u16(uint16x8_t __a) { +__funline float16x8_t vreinterpretq_f16_u16(uint16x8_t __a) { return (float16x8_t)__a; } -FUNK float16x8_t vreinterpretq_f16_u32(uint32x4_t __a) { +__funline float16x8_t vreinterpretq_f16_u32(uint32x4_t __a) { return (float16x8_t)__a; } -FUNK float16x8_t vreinterpretq_f16_u64(uint64x2_t __a) { +__funline float16x8_t vreinterpretq_f16_u64(uint64x2_t __a) { return (float16x8_t)__a; } -FUNK float16x8_t 
vreinterpretq_f16_p8(poly8x16_t __a) { +__funline float16x8_t vreinterpretq_f16_p8(poly8x16_t __a) { return (float16x8_t)__a; } -FUNK float16x8_t vreinterpretq_f16_p128(poly128_t __a) { +__funline float16x8_t vreinterpretq_f16_p128(poly128_t __a) { return (float16x8_t)__a; } -FUNK float16x8_t vreinterpretq_f16_p16(poly16x8_t __a) { +__funline float16x8_t vreinterpretq_f16_p16(poly16x8_t __a) { return (float16x8_t)__a; } -FUNK float16x8_t vreinterpretq_f16_p64(poly64x2_t __a) { +__funline float16x8_t vreinterpretq_f16_p64(poly64x2_t __a) { return (float16x8_t)__a; } -FUNK float32x2_t vreinterpret_f32_f16(float16x4_t __a) { +__funline float32x2_t vreinterpret_f32_f16(float16x4_t __a) { return (float32x2_t)__a; } -FUNK float32x2_t vreinterpret_f32_f64(float64x1_t __a) { +__funline float32x2_t vreinterpret_f32_f64(float64x1_t __a) { return (float32x2_t)__a; } -FUNK float32x2_t vreinterpret_f32_s8(int8x8_t __a) { +__funline float32x2_t vreinterpret_f32_s8(int8x8_t __a) { return (float32x2_t)__a; } -FUNK float32x2_t vreinterpret_f32_s16(int16x4_t __a) { +__funline float32x2_t vreinterpret_f32_s16(int16x4_t __a) { return (float32x2_t)__a; } -FUNK float32x2_t vreinterpret_f32_s32(int32x2_t __a) { +__funline float32x2_t vreinterpret_f32_s32(int32x2_t __a) { return (float32x2_t)__a; } -FUNK float32x2_t vreinterpret_f32_s64(int64x1_t __a) { +__funline float32x2_t vreinterpret_f32_s64(int64x1_t __a) { return (float32x2_t)__a; } -FUNK float32x2_t vreinterpret_f32_u8(uint8x8_t __a) { +__funline float32x2_t vreinterpret_f32_u8(uint8x8_t __a) { return (float32x2_t)__a; } -FUNK float32x2_t vreinterpret_f32_u16(uint16x4_t __a) { +__funline float32x2_t vreinterpret_f32_u16(uint16x4_t __a) { return (float32x2_t)__a; } -FUNK float32x2_t vreinterpret_f32_u32(uint32x2_t __a) { +__funline float32x2_t vreinterpret_f32_u32(uint32x2_t __a) { return (float32x2_t)__a; } -FUNK float32x2_t vreinterpret_f32_u64(uint64x1_t __a) { +__funline float32x2_t vreinterpret_f32_u64(uint64x1_t __a) { return (float32x2_t)__a; } -FUNK float32x2_t vreinterpret_f32_p8(poly8x8_t __a) { +__funline float32x2_t vreinterpret_f32_p8(poly8x8_t __a) { return (float32x2_t)__a; } -FUNK float32x2_t vreinterpret_f32_p16(poly16x4_t __a) { +__funline float32x2_t vreinterpret_f32_p16(poly16x4_t __a) { return (float32x2_t)__a; } -FUNK float32x2_t vreinterpret_f32_p64(poly64x1_t __a) { +__funline float32x2_t vreinterpret_f32_p64(poly64x1_t __a) { return (float32x2_t)__a; } -FUNK float32x4_t vreinterpretq_f32_f16(float16x8_t __a) { +__funline float32x4_t vreinterpretq_f32_f16(float16x8_t __a) { return (float32x4_t)__a; } -FUNK float32x4_t vreinterpretq_f32_f64(float64x2_t __a) { +__funline float32x4_t vreinterpretq_f32_f64(float64x2_t __a) { return (float32x4_t)__a; } -FUNK float32x4_t vreinterpretq_f32_s8(int8x16_t __a) { +__funline float32x4_t vreinterpretq_f32_s8(int8x16_t __a) { return (float32x4_t)__a; } -FUNK float32x4_t vreinterpretq_f32_s16(int16x8_t __a) { +__funline float32x4_t vreinterpretq_f32_s16(int16x8_t __a) { return (float32x4_t)__a; } -FUNK float32x4_t vreinterpretq_f32_s32(int32x4_t __a) { +__funline float32x4_t vreinterpretq_f32_s32(int32x4_t __a) { return (float32x4_t)__a; } -FUNK float32x4_t vreinterpretq_f32_s64(int64x2_t __a) { +__funline float32x4_t vreinterpretq_f32_s64(int64x2_t __a) { return (float32x4_t)__a; } -FUNK float32x4_t vreinterpretq_f32_u8(uint8x16_t __a) { +__funline float32x4_t vreinterpretq_f32_u8(uint8x16_t __a) { return (float32x4_t)__a; } -FUNK float32x4_t vreinterpretq_f32_u16(uint16x8_t __a) { +__funline 
float32x4_t vreinterpretq_f32_u16(uint16x8_t __a) { return (float32x4_t)__a; } -FUNK float32x4_t vreinterpretq_f32_u32(uint32x4_t __a) { +__funline float32x4_t vreinterpretq_f32_u32(uint32x4_t __a) { return (float32x4_t)__a; } -FUNK float32x4_t vreinterpretq_f32_u64(uint64x2_t __a) { +__funline float32x4_t vreinterpretq_f32_u64(uint64x2_t __a) { return (float32x4_t)__a; } -FUNK float32x4_t vreinterpretq_f32_p8(poly8x16_t __a) { +__funline float32x4_t vreinterpretq_f32_p8(poly8x16_t __a) { return (float32x4_t)__a; } -FUNK float32x4_t vreinterpretq_f32_p16(poly16x8_t __a) { +__funline float32x4_t vreinterpretq_f32_p16(poly16x8_t __a) { return (float32x4_t)__a; } -FUNK float32x4_t vreinterpretq_f32_p64(poly64x2_t __a) { +__funline float32x4_t vreinterpretq_f32_p64(poly64x2_t __a) { return (float32x4_t)__a; } -FUNK float32x4_t vreinterpretq_f32_p128(poly128_t __a) { +__funline float32x4_t vreinterpretq_f32_p128(poly128_t __a) { return (float32x4_t)__a; } -FUNK float64x1_t vreinterpret_f64_f16(float16x4_t __a) { +__funline float64x1_t vreinterpret_f64_f16(float16x4_t __a) { return (float64x1_t)__a; } -FUNK float64x1_t vreinterpret_f64_f32(float32x2_t __a) { +__funline float64x1_t vreinterpret_f64_f32(float32x2_t __a) { return (float64x1_t)__a; } -FUNK float64x1_t vreinterpret_f64_p8(poly8x8_t __a) { +__funline float64x1_t vreinterpret_f64_p8(poly8x8_t __a) { return (float64x1_t)__a; } -FUNK float64x1_t vreinterpret_f64_p16(poly16x4_t __a) { +__funline float64x1_t vreinterpret_f64_p16(poly16x4_t __a) { return (float64x1_t)__a; } -FUNK float64x1_t vreinterpret_f64_p64(poly64x1_t __a) { +__funline float64x1_t vreinterpret_f64_p64(poly64x1_t __a) { return (float64x1_t)__a; } -FUNK float64x1_t vreinterpret_f64_s8(int8x8_t __a) { +__funline float64x1_t vreinterpret_f64_s8(int8x8_t __a) { return (float64x1_t)__a; } -FUNK float64x1_t vreinterpret_f64_s16(int16x4_t __a) { +__funline float64x1_t vreinterpret_f64_s16(int16x4_t __a) { return (float64x1_t)__a; } -FUNK float64x1_t vreinterpret_f64_s32(int32x2_t __a) { +__funline float64x1_t vreinterpret_f64_s32(int32x2_t __a) { return (float64x1_t)__a; } -FUNK float64x1_t vreinterpret_f64_s64(int64x1_t __a) { +__funline float64x1_t vreinterpret_f64_s64(int64x1_t __a) { return (float64x1_t)__a; } -FUNK float64x1_t vreinterpret_f64_u8(uint8x8_t __a) { +__funline float64x1_t vreinterpret_f64_u8(uint8x8_t __a) { return (float64x1_t)__a; } -FUNK float64x1_t vreinterpret_f64_u16(uint16x4_t __a) { +__funline float64x1_t vreinterpret_f64_u16(uint16x4_t __a) { return (float64x1_t)__a; } -FUNK float64x1_t vreinterpret_f64_u32(uint32x2_t __a) { +__funline float64x1_t vreinterpret_f64_u32(uint32x2_t __a) { return (float64x1_t)__a; } -FUNK float64x1_t vreinterpret_f64_u64(uint64x1_t __a) { +__funline float64x1_t vreinterpret_f64_u64(uint64x1_t __a) { return (float64x1_t)__a; } -FUNK float64x2_t vreinterpretq_f64_f16(float16x8_t __a) { +__funline float64x2_t vreinterpretq_f64_f16(float16x8_t __a) { return (float64x2_t)__a; } -FUNK float64x2_t vreinterpretq_f64_f32(float32x4_t __a) { +__funline float64x2_t vreinterpretq_f64_f32(float32x4_t __a) { return (float64x2_t)__a; } -FUNK float64x2_t vreinterpretq_f64_p8(poly8x16_t __a) { +__funline float64x2_t vreinterpretq_f64_p8(poly8x16_t __a) { return (float64x2_t)__a; } -FUNK float64x2_t vreinterpretq_f64_p16(poly16x8_t __a) { +__funline float64x2_t vreinterpretq_f64_p16(poly16x8_t __a) { return (float64x2_t)__a; } -FUNK float64x2_t vreinterpretq_f64_p64(poly64x2_t __a) { +__funline float64x2_t vreinterpretq_f64_p64(poly64x2_t 
__a) { return (float64x2_t)__a; } -FUNK float64x2_t vreinterpretq_f64_s8(int8x16_t __a) { +__funline float64x2_t vreinterpretq_f64_s8(int8x16_t __a) { return (float64x2_t)__a; } -FUNK float64x2_t vreinterpretq_f64_s16(int16x8_t __a) { +__funline float64x2_t vreinterpretq_f64_s16(int16x8_t __a) { return (float64x2_t)__a; } -FUNK float64x2_t vreinterpretq_f64_s32(int32x4_t __a) { +__funline float64x2_t vreinterpretq_f64_s32(int32x4_t __a) { return (float64x2_t)__a; } -FUNK float64x2_t vreinterpretq_f64_s64(int64x2_t __a) { +__funline float64x2_t vreinterpretq_f64_s64(int64x2_t __a) { return (float64x2_t)__a; } -FUNK float64x2_t vreinterpretq_f64_u8(uint8x16_t __a) { +__funline float64x2_t vreinterpretq_f64_u8(uint8x16_t __a) { return (float64x2_t)__a; } -FUNK float64x2_t vreinterpretq_f64_u16(uint16x8_t __a) { +__funline float64x2_t vreinterpretq_f64_u16(uint16x8_t __a) { return (float64x2_t)__a; } -FUNK float64x2_t vreinterpretq_f64_u32(uint32x4_t __a) { +__funline float64x2_t vreinterpretq_f64_u32(uint32x4_t __a) { return (float64x2_t)__a; } -FUNK float64x2_t vreinterpretq_f64_u64(uint64x2_t __a) { +__funline float64x2_t vreinterpretq_f64_u64(uint64x2_t __a) { return (float64x2_t)__a; } -FUNK int64x1_t vreinterpret_s64_f16(float16x4_t __a) { +__funline int64x1_t vreinterpret_s64_f16(float16x4_t __a) { return (int64x1_t)__a; } -FUNK int64x1_t vreinterpret_s64_f64(float64x1_t __a) { +__funline int64x1_t vreinterpret_s64_f64(float64x1_t __a) { return (int64x1_t)__a; } -FUNK int64x1_t vreinterpret_s64_s8(int8x8_t __a) { +__funline int64x1_t vreinterpret_s64_s8(int8x8_t __a) { return (int64x1_t)__a; } -FUNK int64x1_t vreinterpret_s64_s16(int16x4_t __a) { +__funline int64x1_t vreinterpret_s64_s16(int16x4_t __a) { return (int64x1_t)__a; } -FUNK int64x1_t vreinterpret_s64_s32(int32x2_t __a) { +__funline int64x1_t vreinterpret_s64_s32(int32x2_t __a) { return (int64x1_t)__a; } -FUNK int64x1_t vreinterpret_s64_f32(float32x2_t __a) { +__funline int64x1_t vreinterpret_s64_f32(float32x2_t __a) { return (int64x1_t)__a; } -FUNK int64x1_t vreinterpret_s64_u8(uint8x8_t __a) { +__funline int64x1_t vreinterpret_s64_u8(uint8x8_t __a) { return (int64x1_t)__a; } -FUNK int64x1_t vreinterpret_s64_u16(uint16x4_t __a) { +__funline int64x1_t vreinterpret_s64_u16(uint16x4_t __a) { return (int64x1_t)__a; } -FUNK int64x1_t vreinterpret_s64_u32(uint32x2_t __a) { +__funline int64x1_t vreinterpret_s64_u32(uint32x2_t __a) { return (int64x1_t)__a; } -FUNK int64x1_t vreinterpret_s64_u64(uint64x1_t __a) { +__funline int64x1_t vreinterpret_s64_u64(uint64x1_t __a) { return (int64x1_t)__a; } -FUNK int64x1_t vreinterpret_s64_p8(poly8x8_t __a) { +__funline int64x1_t vreinterpret_s64_p8(poly8x8_t __a) { return (int64x1_t)__a; } -FUNK int64x1_t vreinterpret_s64_p16(poly16x4_t __a) { +__funline int64x1_t vreinterpret_s64_p16(poly16x4_t __a) { return (int64x1_t)__a; } -FUNK int64x1_t vreinterpret_s64_p64(poly64x1_t __a) { +__funline int64x1_t vreinterpret_s64_p64(poly64x1_t __a) { return (int64x1_t)__a; } -FUNK int64x2_t vreinterpretq_s64_f64(float64x2_t __a) { +__funline int64x2_t vreinterpretq_s64_f64(float64x2_t __a) { return (int64x2_t)__a; } -FUNK int64x2_t vreinterpretq_s64_s8(int8x16_t __a) { +__funline int64x2_t vreinterpretq_s64_s8(int8x16_t __a) { return (int64x2_t)__a; } -FUNK int64x2_t vreinterpretq_s64_s16(int16x8_t __a) { +__funline int64x2_t vreinterpretq_s64_s16(int16x8_t __a) { return (int64x2_t)__a; } -FUNK int64x2_t vreinterpretq_s64_s32(int32x4_t __a) { +__funline int64x2_t vreinterpretq_s64_s32(int32x4_t __a) { 
return (int64x2_t)__a; } -FUNK int64x2_t vreinterpretq_s64_f16(float16x8_t __a) { +__funline int64x2_t vreinterpretq_s64_f16(float16x8_t __a) { return (int64x2_t)__a; } -FUNK int64x2_t vreinterpretq_s64_f32(float32x4_t __a) { +__funline int64x2_t vreinterpretq_s64_f32(float32x4_t __a) { return (int64x2_t)__a; } -FUNK int64x2_t vreinterpretq_s64_u8(uint8x16_t __a) { +__funline int64x2_t vreinterpretq_s64_u8(uint8x16_t __a) { return (int64x2_t)__a; } -FUNK int64x2_t vreinterpretq_s64_u16(uint16x8_t __a) { +__funline int64x2_t vreinterpretq_s64_u16(uint16x8_t __a) { return (int64x2_t)__a; } -FUNK int64x2_t vreinterpretq_s64_u32(uint32x4_t __a) { +__funline int64x2_t vreinterpretq_s64_u32(uint32x4_t __a) { return (int64x2_t)__a; } -FUNK int64x2_t vreinterpretq_s64_u64(uint64x2_t __a) { +__funline int64x2_t vreinterpretq_s64_u64(uint64x2_t __a) { return (int64x2_t)__a; } -FUNK int64x2_t vreinterpretq_s64_p8(poly8x16_t __a) { +__funline int64x2_t vreinterpretq_s64_p8(poly8x16_t __a) { return (int64x2_t)__a; } -FUNK int64x2_t vreinterpretq_s64_p16(poly16x8_t __a) { +__funline int64x2_t vreinterpretq_s64_p16(poly16x8_t __a) { return (int64x2_t)__a; } -FUNK int64x2_t vreinterpretq_s64_p64(poly64x2_t __a) { +__funline int64x2_t vreinterpretq_s64_p64(poly64x2_t __a) { return (int64x2_t)__a; } -FUNK int64x2_t vreinterpretq_s64_p128(poly128_t __a) { +__funline int64x2_t vreinterpretq_s64_p128(poly128_t __a) { return (int64x2_t)__a; } -FUNK uint64x1_t vreinterpret_u64_f16(float16x4_t __a) { +__funline uint64x1_t vreinterpret_u64_f16(float16x4_t __a) { return (uint64x1_t)__a; } -FUNK uint64x1_t vreinterpret_u64_f64(float64x1_t __a) { +__funline uint64x1_t vreinterpret_u64_f64(float64x1_t __a) { return (uint64x1_t)__a; } -FUNK uint64x1_t vreinterpret_u64_s8(int8x8_t __a) { +__funline uint64x1_t vreinterpret_u64_s8(int8x8_t __a) { return (uint64x1_t)__a; } -FUNK uint64x1_t vreinterpret_u64_s16(int16x4_t __a) { +__funline uint64x1_t vreinterpret_u64_s16(int16x4_t __a) { return (uint64x1_t)__a; } -FUNK uint64x1_t vreinterpret_u64_s32(int32x2_t __a) { +__funline uint64x1_t vreinterpret_u64_s32(int32x2_t __a) { return (uint64x1_t)__a; } -FUNK uint64x1_t vreinterpret_u64_s64(int64x1_t __a) { +__funline uint64x1_t vreinterpret_u64_s64(int64x1_t __a) { return (uint64x1_t)__a; } -FUNK uint64x1_t vreinterpret_u64_f32(float32x2_t __a) { +__funline uint64x1_t vreinterpret_u64_f32(float32x2_t __a) { return (uint64x1_t)__a; } -FUNK uint64x1_t vreinterpret_u64_u8(uint8x8_t __a) { +__funline uint64x1_t vreinterpret_u64_u8(uint8x8_t __a) { return (uint64x1_t)__a; } -FUNK uint64x1_t vreinterpret_u64_u16(uint16x4_t __a) { +__funline uint64x1_t vreinterpret_u64_u16(uint16x4_t __a) { return (uint64x1_t)__a; } -FUNK uint64x1_t vreinterpret_u64_u32(uint32x2_t __a) { +__funline uint64x1_t vreinterpret_u64_u32(uint32x2_t __a) { return (uint64x1_t)__a; } -FUNK uint64x1_t vreinterpret_u64_p8(poly8x8_t __a) { +__funline uint64x1_t vreinterpret_u64_p8(poly8x8_t __a) { return (uint64x1_t)__a; } -FUNK uint64x1_t vreinterpret_u64_p16(poly16x4_t __a) { +__funline uint64x1_t vreinterpret_u64_p16(poly16x4_t __a) { return (uint64x1_t)__a; } -FUNK uint64x1_t vreinterpret_u64_p64(poly64x1_t __a) { +__funline uint64x1_t vreinterpret_u64_p64(poly64x1_t __a) { return (uint64x1_t)__a; } -FUNK uint64x2_t vreinterpretq_u64_f64(float64x2_t __a) { +__funline uint64x2_t vreinterpretq_u64_f64(float64x2_t __a) { return (uint64x2_t)__a; } -FUNK uint64x2_t vreinterpretq_u64_s8(int8x16_t __a) { +__funline uint64x2_t vreinterpretq_u64_s8(int8x16_t __a) { 
return (uint64x2_t)__a; } -FUNK uint64x2_t vreinterpretq_u64_s16(int16x8_t __a) { +__funline uint64x2_t vreinterpretq_u64_s16(int16x8_t __a) { return (uint64x2_t)__a; } -FUNK uint64x2_t vreinterpretq_u64_s32(int32x4_t __a) { +__funline uint64x2_t vreinterpretq_u64_s32(int32x4_t __a) { return (uint64x2_t)__a; } -FUNK uint64x2_t vreinterpretq_u64_s64(int64x2_t __a) { +__funline uint64x2_t vreinterpretq_u64_s64(int64x2_t __a) { return (uint64x2_t)__a; } -FUNK uint64x2_t vreinterpretq_u64_f16(float16x8_t __a) { +__funline uint64x2_t vreinterpretq_u64_f16(float16x8_t __a) { return (uint64x2_t)__a; } -FUNK uint64x2_t vreinterpretq_u64_f32(float32x4_t __a) { +__funline uint64x2_t vreinterpretq_u64_f32(float32x4_t __a) { return (uint64x2_t)__a; } -FUNK uint64x2_t vreinterpretq_u64_u8(uint8x16_t __a) { +__funline uint64x2_t vreinterpretq_u64_u8(uint8x16_t __a) { return (uint64x2_t)__a; } -FUNK uint64x2_t vreinterpretq_u64_u16(uint16x8_t __a) { +__funline uint64x2_t vreinterpretq_u64_u16(uint16x8_t __a) { return (uint64x2_t)__a; } -FUNK uint64x2_t vreinterpretq_u64_u32(uint32x4_t __a) { +__funline uint64x2_t vreinterpretq_u64_u32(uint32x4_t __a) { return (uint64x2_t)__a; } -FUNK uint64x2_t vreinterpretq_u64_p8(poly8x16_t __a) { +__funline uint64x2_t vreinterpretq_u64_p8(poly8x16_t __a) { return (uint64x2_t)__a; } -FUNK uint64x2_t vreinterpretq_u64_p16(poly16x8_t __a) { +__funline uint64x2_t vreinterpretq_u64_p16(poly16x8_t __a) { return (uint64x2_t)__a; } -FUNK uint64x2_t vreinterpretq_u64_p64(poly64x2_t __a) { +__funline uint64x2_t vreinterpretq_u64_p64(poly64x2_t __a) { return (uint64x2_t)__a; } -FUNK uint64x2_t vreinterpretq_u64_p128(poly128_t __a) { +__funline uint64x2_t vreinterpretq_u64_p128(poly128_t __a) { return (uint64x2_t)__a; } -FUNK int8x8_t vreinterpret_s8_f16(float16x4_t __a) { +__funline int8x8_t vreinterpret_s8_f16(float16x4_t __a) { return (int8x8_t)__a; } -FUNK int8x8_t vreinterpret_s8_f64(float64x1_t __a) { +__funline int8x8_t vreinterpret_s8_f64(float64x1_t __a) { return (int8x8_t)__a; } -FUNK int8x8_t vreinterpret_s8_s16(int16x4_t __a) { +__funline int8x8_t vreinterpret_s8_s16(int16x4_t __a) { return (int8x8_t)__a; } -FUNK int8x8_t vreinterpret_s8_s32(int32x2_t __a) { +__funline int8x8_t vreinterpret_s8_s32(int32x2_t __a) { return (int8x8_t)__a; } -FUNK int8x8_t vreinterpret_s8_s64(int64x1_t __a) { +__funline int8x8_t vreinterpret_s8_s64(int64x1_t __a) { return (int8x8_t)__a; } -FUNK int8x8_t vreinterpret_s8_f32(float32x2_t __a) { +__funline int8x8_t vreinterpret_s8_f32(float32x2_t __a) { return (int8x8_t)__a; } -FUNK int8x8_t vreinterpret_s8_u8(uint8x8_t __a) { +__funline int8x8_t vreinterpret_s8_u8(uint8x8_t __a) { return (int8x8_t)__a; } -FUNK int8x8_t vreinterpret_s8_u16(uint16x4_t __a) { +__funline int8x8_t vreinterpret_s8_u16(uint16x4_t __a) { return (int8x8_t)__a; } -FUNK int8x8_t vreinterpret_s8_u32(uint32x2_t __a) { +__funline int8x8_t vreinterpret_s8_u32(uint32x2_t __a) { return (int8x8_t)__a; } -FUNK int8x8_t vreinterpret_s8_u64(uint64x1_t __a) { +__funline int8x8_t vreinterpret_s8_u64(uint64x1_t __a) { return (int8x8_t)__a; } -FUNK int8x8_t vreinterpret_s8_p8(poly8x8_t __a) { +__funline int8x8_t vreinterpret_s8_p8(poly8x8_t __a) { return (int8x8_t)__a; } -FUNK int8x8_t vreinterpret_s8_p16(poly16x4_t __a) { +__funline int8x8_t vreinterpret_s8_p16(poly16x4_t __a) { return (int8x8_t)__a; } -FUNK int8x8_t vreinterpret_s8_p64(poly64x1_t __a) { +__funline int8x8_t vreinterpret_s8_p64(poly64x1_t __a) { return (int8x8_t)__a; } -FUNK int8x16_t 
vreinterpretq_s8_f64(float64x2_t __a) { +__funline int8x16_t vreinterpretq_s8_f64(float64x2_t __a) { return (int8x16_t)__a; } -FUNK int8x16_t vreinterpretq_s8_s16(int16x8_t __a) { +__funline int8x16_t vreinterpretq_s8_s16(int16x8_t __a) { return (int8x16_t)__a; } -FUNK int8x16_t vreinterpretq_s8_s32(int32x4_t __a) { +__funline int8x16_t vreinterpretq_s8_s32(int32x4_t __a) { return (int8x16_t)__a; } -FUNK int8x16_t vreinterpretq_s8_s64(int64x2_t __a) { +__funline int8x16_t vreinterpretq_s8_s64(int64x2_t __a) { return (int8x16_t)__a; } -FUNK int8x16_t vreinterpretq_s8_f16(float16x8_t __a) { +__funline int8x16_t vreinterpretq_s8_f16(float16x8_t __a) { return (int8x16_t)__a; } -FUNK int8x16_t vreinterpretq_s8_f32(float32x4_t __a) { +__funline int8x16_t vreinterpretq_s8_f32(float32x4_t __a) { return (int8x16_t)__a; } -FUNK int8x16_t vreinterpretq_s8_u8(uint8x16_t __a) { +__funline int8x16_t vreinterpretq_s8_u8(uint8x16_t __a) { return (int8x16_t)__a; } -FUNK int8x16_t vreinterpretq_s8_u16(uint16x8_t __a) { +__funline int8x16_t vreinterpretq_s8_u16(uint16x8_t __a) { return (int8x16_t)__a; } -FUNK int8x16_t vreinterpretq_s8_u32(uint32x4_t __a) { +__funline int8x16_t vreinterpretq_s8_u32(uint32x4_t __a) { return (int8x16_t)__a; } -FUNK int8x16_t vreinterpretq_s8_u64(uint64x2_t __a) { +__funline int8x16_t vreinterpretq_s8_u64(uint64x2_t __a) { return (int8x16_t)__a; } -FUNK int8x16_t vreinterpretq_s8_p8(poly8x16_t __a) { +__funline int8x16_t vreinterpretq_s8_p8(poly8x16_t __a) { return (int8x16_t)__a; } -FUNK int8x16_t vreinterpretq_s8_p16(poly16x8_t __a) { +__funline int8x16_t vreinterpretq_s8_p16(poly16x8_t __a) { return (int8x16_t)__a; } -FUNK int8x16_t vreinterpretq_s8_p64(poly64x2_t __a) { +__funline int8x16_t vreinterpretq_s8_p64(poly64x2_t __a) { return (int8x16_t)__a; } -FUNK int8x16_t vreinterpretq_s8_p128(poly128_t __a) { +__funline int8x16_t vreinterpretq_s8_p128(poly128_t __a) { return (int8x16_t)__a; } -FUNK int16x4_t vreinterpret_s16_f16(float16x4_t __a) { +__funline int16x4_t vreinterpret_s16_f16(float16x4_t __a) { return (int16x4_t)__a; } -FUNK int16x4_t vreinterpret_s16_f64(float64x1_t __a) { +__funline int16x4_t vreinterpret_s16_f64(float64x1_t __a) { return (int16x4_t)__a; } -FUNK int16x4_t vreinterpret_s16_s8(int8x8_t __a) { +__funline int16x4_t vreinterpret_s16_s8(int8x8_t __a) { return (int16x4_t)__a; } -FUNK int16x4_t vreinterpret_s16_s32(int32x2_t __a) { +__funline int16x4_t vreinterpret_s16_s32(int32x2_t __a) { return (int16x4_t)__a; } -FUNK int16x4_t vreinterpret_s16_s64(int64x1_t __a) { +__funline int16x4_t vreinterpret_s16_s64(int64x1_t __a) { return (int16x4_t)__a; } -FUNK int16x4_t vreinterpret_s16_f32(float32x2_t __a) { +__funline int16x4_t vreinterpret_s16_f32(float32x2_t __a) { return (int16x4_t)__a; } -FUNK int16x4_t vreinterpret_s16_u8(uint8x8_t __a) { +__funline int16x4_t vreinterpret_s16_u8(uint8x8_t __a) { return (int16x4_t)__a; } -FUNK int16x4_t vreinterpret_s16_u16(uint16x4_t __a) { +__funline int16x4_t vreinterpret_s16_u16(uint16x4_t __a) { return (int16x4_t)__a; } -FUNK int16x4_t vreinterpret_s16_u32(uint32x2_t __a) { +__funline int16x4_t vreinterpret_s16_u32(uint32x2_t __a) { return (int16x4_t)__a; } -FUNK int16x4_t vreinterpret_s16_u64(uint64x1_t __a) { +__funline int16x4_t vreinterpret_s16_u64(uint64x1_t __a) { return (int16x4_t)__a; } -FUNK int16x4_t vreinterpret_s16_p8(poly8x8_t __a) { +__funline int16x4_t vreinterpret_s16_p8(poly8x8_t __a) { return (int16x4_t)__a; } -FUNK int16x4_t vreinterpret_s16_p16(poly16x4_t __a) { +__funline int16x4_t 
vreinterpret_s16_p16(poly16x4_t __a) { return (int16x4_t)__a; } -FUNK int16x4_t vreinterpret_s16_p64(poly64x1_t __a) { +__funline int16x4_t vreinterpret_s16_p64(poly64x1_t __a) { return (int16x4_t)__a; } -FUNK int16x8_t vreinterpretq_s16_f64(float64x2_t __a) { +__funline int16x8_t vreinterpretq_s16_f64(float64x2_t __a) { return (int16x8_t)__a; } -FUNK int16x8_t vreinterpretq_s16_s8(int8x16_t __a) { +__funline int16x8_t vreinterpretq_s16_s8(int8x16_t __a) { return (int16x8_t)__a; } -FUNK int16x8_t vreinterpretq_s16_s32(int32x4_t __a) { +__funline int16x8_t vreinterpretq_s16_s32(int32x4_t __a) { return (int16x8_t)__a; } -FUNK int16x8_t vreinterpretq_s16_s64(int64x2_t __a) { +__funline int16x8_t vreinterpretq_s16_s64(int64x2_t __a) { return (int16x8_t)__a; } -FUNK int16x8_t vreinterpretq_s16_f16(float16x8_t __a) { +__funline int16x8_t vreinterpretq_s16_f16(float16x8_t __a) { return (int16x8_t)__a; } -FUNK int16x8_t vreinterpretq_s16_f32(float32x4_t __a) { +__funline int16x8_t vreinterpretq_s16_f32(float32x4_t __a) { return (int16x8_t)__a; } -FUNK int16x8_t vreinterpretq_s16_u8(uint8x16_t __a) { +__funline int16x8_t vreinterpretq_s16_u8(uint8x16_t __a) { return (int16x8_t)__a; } -FUNK int16x8_t vreinterpretq_s16_u16(uint16x8_t __a) { +__funline int16x8_t vreinterpretq_s16_u16(uint16x8_t __a) { return (int16x8_t)__a; } -FUNK int16x8_t vreinterpretq_s16_u32(uint32x4_t __a) { +__funline int16x8_t vreinterpretq_s16_u32(uint32x4_t __a) { return (int16x8_t)__a; } -FUNK int16x8_t vreinterpretq_s16_u64(uint64x2_t __a) { +__funline int16x8_t vreinterpretq_s16_u64(uint64x2_t __a) { return (int16x8_t)__a; } -FUNK int16x8_t vreinterpretq_s16_p8(poly8x16_t __a) { +__funline int16x8_t vreinterpretq_s16_p8(poly8x16_t __a) { return (int16x8_t)__a; } -FUNK int16x8_t vreinterpretq_s16_p16(poly16x8_t __a) { +__funline int16x8_t vreinterpretq_s16_p16(poly16x8_t __a) { return (int16x8_t)__a; } -FUNK int16x8_t vreinterpretq_s16_p64(poly64x2_t __a) { +__funline int16x8_t vreinterpretq_s16_p64(poly64x2_t __a) { return (int16x8_t)__a; } -FUNK int16x8_t vreinterpretq_s16_p128(poly128_t __a) { +__funline int16x8_t vreinterpretq_s16_p128(poly128_t __a) { return (int16x8_t)__a; } -FUNK int32x2_t vreinterpret_s32_f16(float16x4_t __a) { +__funline int32x2_t vreinterpret_s32_f16(float16x4_t __a) { return (int32x2_t)__a; } -FUNK int32x2_t vreinterpret_s32_f64(float64x1_t __a) { +__funline int32x2_t vreinterpret_s32_f64(float64x1_t __a) { return (int32x2_t)__a; } -FUNK int32x2_t vreinterpret_s32_s8(int8x8_t __a) { +__funline int32x2_t vreinterpret_s32_s8(int8x8_t __a) { return (int32x2_t)__a; } -FUNK int32x2_t vreinterpret_s32_s16(int16x4_t __a) { +__funline int32x2_t vreinterpret_s32_s16(int16x4_t __a) { return (int32x2_t)__a; } -FUNK int32x2_t vreinterpret_s32_s64(int64x1_t __a) { +__funline int32x2_t vreinterpret_s32_s64(int64x1_t __a) { return (int32x2_t)__a; } -FUNK int32x2_t vreinterpret_s32_f32(float32x2_t __a) { +__funline int32x2_t vreinterpret_s32_f32(float32x2_t __a) { return (int32x2_t)__a; } -FUNK int32x2_t vreinterpret_s32_u8(uint8x8_t __a) { +__funline int32x2_t vreinterpret_s32_u8(uint8x8_t __a) { return (int32x2_t)__a; } -FUNK int32x2_t vreinterpret_s32_u16(uint16x4_t __a) { +__funline int32x2_t vreinterpret_s32_u16(uint16x4_t __a) { return (int32x2_t)__a; } -FUNK int32x2_t vreinterpret_s32_u32(uint32x2_t __a) { +__funline int32x2_t vreinterpret_s32_u32(uint32x2_t __a) { return (int32x2_t)__a; } -FUNK int32x2_t vreinterpret_s32_u64(uint64x1_t __a) { +__funline int32x2_t vreinterpret_s32_u64(uint64x1_t __a) { 
return (int32x2_t)__a; } -FUNK int32x2_t vreinterpret_s32_p8(poly8x8_t __a) { +__funline int32x2_t vreinterpret_s32_p8(poly8x8_t __a) { return (int32x2_t)__a; } -FUNK int32x2_t vreinterpret_s32_p16(poly16x4_t __a) { +__funline int32x2_t vreinterpret_s32_p16(poly16x4_t __a) { return (int32x2_t)__a; } -FUNK int32x2_t vreinterpret_s32_p64(poly64x1_t __a) { +__funline int32x2_t vreinterpret_s32_p64(poly64x1_t __a) { return (int32x2_t)__a; } -FUNK int32x4_t vreinterpretq_s32_f64(float64x2_t __a) { +__funline int32x4_t vreinterpretq_s32_f64(float64x2_t __a) { return (int32x4_t)__a; } -FUNK int32x4_t vreinterpretq_s32_s8(int8x16_t __a) { +__funline int32x4_t vreinterpretq_s32_s8(int8x16_t __a) { return (int32x4_t)__a; } -FUNK int32x4_t vreinterpretq_s32_s16(int16x8_t __a) { +__funline int32x4_t vreinterpretq_s32_s16(int16x8_t __a) { return (int32x4_t)__a; } -FUNK int32x4_t vreinterpretq_s32_s64(int64x2_t __a) { +__funline int32x4_t vreinterpretq_s32_s64(int64x2_t __a) { return (int32x4_t)__a; } -FUNK int32x4_t vreinterpretq_s32_f16(float16x8_t __a) { +__funline int32x4_t vreinterpretq_s32_f16(float16x8_t __a) { return (int32x4_t)__a; } -FUNK int32x4_t vreinterpretq_s32_f32(float32x4_t __a) { +__funline int32x4_t vreinterpretq_s32_f32(float32x4_t __a) { return (int32x4_t)__a; } -FUNK int32x4_t vreinterpretq_s32_u8(uint8x16_t __a) { +__funline int32x4_t vreinterpretq_s32_u8(uint8x16_t __a) { return (int32x4_t)__a; } -FUNK int32x4_t vreinterpretq_s32_u16(uint16x8_t __a) { +__funline int32x4_t vreinterpretq_s32_u16(uint16x8_t __a) { return (int32x4_t)__a; } -FUNK int32x4_t vreinterpretq_s32_u32(uint32x4_t __a) { +__funline int32x4_t vreinterpretq_s32_u32(uint32x4_t __a) { return (int32x4_t)__a; } -FUNK int32x4_t vreinterpretq_s32_u64(uint64x2_t __a) { +__funline int32x4_t vreinterpretq_s32_u64(uint64x2_t __a) { return (int32x4_t)__a; } -FUNK int32x4_t vreinterpretq_s32_p8(poly8x16_t __a) { +__funline int32x4_t vreinterpretq_s32_p8(poly8x16_t __a) { return (int32x4_t)__a; } -FUNK int32x4_t vreinterpretq_s32_p16(poly16x8_t __a) { +__funline int32x4_t vreinterpretq_s32_p16(poly16x8_t __a) { return (int32x4_t)__a; } -FUNK int32x4_t vreinterpretq_s32_p64(poly64x2_t __a) { +__funline int32x4_t vreinterpretq_s32_p64(poly64x2_t __a) { return (int32x4_t)__a; } -FUNK int32x4_t vreinterpretq_s32_p128(poly128_t __a) { +__funline int32x4_t vreinterpretq_s32_p128(poly128_t __a) { return (int32x4_t)__a; } -FUNK uint8x8_t vreinterpret_u8_f16(float16x4_t __a) { +__funline uint8x8_t vreinterpret_u8_f16(float16x4_t __a) { return (uint8x8_t)__a; } -FUNK uint8x8_t vreinterpret_u8_f64(float64x1_t __a) { +__funline uint8x8_t vreinterpret_u8_f64(float64x1_t __a) { return (uint8x8_t)__a; } -FUNK uint8x8_t vreinterpret_u8_s8(int8x8_t __a) { +__funline uint8x8_t vreinterpret_u8_s8(int8x8_t __a) { return (uint8x8_t)__a; } -FUNK uint8x8_t vreinterpret_u8_s16(int16x4_t __a) { +__funline uint8x8_t vreinterpret_u8_s16(int16x4_t __a) { return (uint8x8_t)__a; } -FUNK uint8x8_t vreinterpret_u8_s32(int32x2_t __a) { +__funline uint8x8_t vreinterpret_u8_s32(int32x2_t __a) { return (uint8x8_t)__a; } -FUNK uint8x8_t vreinterpret_u8_s64(int64x1_t __a) { +__funline uint8x8_t vreinterpret_u8_s64(int64x1_t __a) { return (uint8x8_t)__a; } -FUNK uint8x8_t vreinterpret_u8_f32(float32x2_t __a) { +__funline uint8x8_t vreinterpret_u8_f32(float32x2_t __a) { return (uint8x8_t)__a; } -FUNK uint8x8_t vreinterpret_u8_u16(uint16x4_t __a) { +__funline uint8x8_t vreinterpret_u8_u16(uint16x4_t __a) { return (uint8x8_t)__a; } -FUNK uint8x8_t 
vreinterpret_u8_u32(uint32x2_t __a) { +__funline uint8x8_t vreinterpret_u8_u32(uint32x2_t __a) { return (uint8x8_t)__a; } -FUNK uint8x8_t vreinterpret_u8_u64(uint64x1_t __a) { +__funline uint8x8_t vreinterpret_u8_u64(uint64x1_t __a) { return (uint8x8_t)__a; } -FUNK uint8x8_t vreinterpret_u8_p8(poly8x8_t __a) { +__funline uint8x8_t vreinterpret_u8_p8(poly8x8_t __a) { return (uint8x8_t)__a; } -FUNK uint8x8_t vreinterpret_u8_p16(poly16x4_t __a) { +__funline uint8x8_t vreinterpret_u8_p16(poly16x4_t __a) { return (uint8x8_t)__a; } -FUNK uint8x8_t vreinterpret_u8_p64(poly64x1_t __a) { +__funline uint8x8_t vreinterpret_u8_p64(poly64x1_t __a) { return (uint8x8_t)__a; } -FUNK uint8x16_t vreinterpretq_u8_f64(float64x2_t __a) { +__funline uint8x16_t vreinterpretq_u8_f64(float64x2_t __a) { return (uint8x16_t)__a; } -FUNK uint8x16_t vreinterpretq_u8_s8(int8x16_t __a) { +__funline uint8x16_t vreinterpretq_u8_s8(int8x16_t __a) { return (uint8x16_t)__a; } -FUNK uint8x16_t vreinterpretq_u8_s16(int16x8_t __a) { +__funline uint8x16_t vreinterpretq_u8_s16(int16x8_t __a) { return (uint8x16_t)__a; } -FUNK uint8x16_t vreinterpretq_u8_s32(int32x4_t __a) { +__funline uint8x16_t vreinterpretq_u8_s32(int32x4_t __a) { return (uint8x16_t)__a; } -FUNK uint8x16_t vreinterpretq_u8_s64(int64x2_t __a) { +__funline uint8x16_t vreinterpretq_u8_s64(int64x2_t __a) { return (uint8x16_t)__a; } -FUNK uint8x16_t vreinterpretq_u8_f16(float16x8_t __a) { +__funline uint8x16_t vreinterpretq_u8_f16(float16x8_t __a) { return (uint8x16_t)__a; } -FUNK uint8x16_t vreinterpretq_u8_f32(float32x4_t __a) { +__funline uint8x16_t vreinterpretq_u8_f32(float32x4_t __a) { return (uint8x16_t)__a; } -FUNK uint8x16_t vreinterpretq_u8_u16(uint16x8_t __a) { +__funline uint8x16_t vreinterpretq_u8_u16(uint16x8_t __a) { return (uint8x16_t)__a; } -FUNK uint8x16_t vreinterpretq_u8_u32(uint32x4_t __a) { +__funline uint8x16_t vreinterpretq_u8_u32(uint32x4_t __a) { return (uint8x16_t)__a; } -FUNK uint8x16_t vreinterpretq_u8_u64(uint64x2_t __a) { +__funline uint8x16_t vreinterpretq_u8_u64(uint64x2_t __a) { return (uint8x16_t)__a; } -FUNK uint8x16_t vreinterpretq_u8_p8(poly8x16_t __a) { +__funline uint8x16_t vreinterpretq_u8_p8(poly8x16_t __a) { return (uint8x16_t)__a; } -FUNK uint8x16_t vreinterpretq_u8_p16(poly16x8_t __a) { +__funline uint8x16_t vreinterpretq_u8_p16(poly16x8_t __a) { return (uint8x16_t)__a; } -FUNK uint8x16_t vreinterpretq_u8_p64(poly64x2_t __a) { +__funline uint8x16_t vreinterpretq_u8_p64(poly64x2_t __a) { return (uint8x16_t)__a; } -FUNK uint8x16_t vreinterpretq_u8_p128(poly128_t __a) { +__funline uint8x16_t vreinterpretq_u8_p128(poly128_t __a) { return (uint8x16_t)__a; } -FUNK uint16x4_t vreinterpret_u16_f16(float16x4_t __a) { +__funline uint16x4_t vreinterpret_u16_f16(float16x4_t __a) { return (uint16x4_t)__a; } -FUNK uint16x4_t vreinterpret_u16_f64(float64x1_t __a) { +__funline uint16x4_t vreinterpret_u16_f64(float64x1_t __a) { return (uint16x4_t)__a; } -FUNK uint16x4_t vreinterpret_u16_s8(int8x8_t __a) { +__funline uint16x4_t vreinterpret_u16_s8(int8x8_t __a) { return (uint16x4_t)__a; } -FUNK uint16x4_t vreinterpret_u16_s16(int16x4_t __a) { +__funline uint16x4_t vreinterpret_u16_s16(int16x4_t __a) { return (uint16x4_t)__a; } -FUNK uint16x4_t vreinterpret_u16_s32(int32x2_t __a) { +__funline uint16x4_t vreinterpret_u16_s32(int32x2_t __a) { return (uint16x4_t)__a; } -FUNK uint16x4_t vreinterpret_u16_s64(int64x1_t __a) { +__funline uint16x4_t vreinterpret_u16_s64(int64x1_t __a) { return (uint16x4_t)__a; } -FUNK uint16x4_t 
vreinterpret_u16_f32(float32x2_t __a) { +__funline uint16x4_t vreinterpret_u16_f32(float32x2_t __a) { return (uint16x4_t)__a; } -FUNK uint16x4_t vreinterpret_u16_u8(uint8x8_t __a) { +__funline uint16x4_t vreinterpret_u16_u8(uint8x8_t __a) { return (uint16x4_t)__a; } -FUNK uint16x4_t vreinterpret_u16_u32(uint32x2_t __a) { +__funline uint16x4_t vreinterpret_u16_u32(uint32x2_t __a) { return (uint16x4_t)__a; } -FUNK uint16x4_t vreinterpret_u16_u64(uint64x1_t __a) { +__funline uint16x4_t vreinterpret_u16_u64(uint64x1_t __a) { return (uint16x4_t)__a; } -FUNK uint16x4_t vreinterpret_u16_p8(poly8x8_t __a) { +__funline uint16x4_t vreinterpret_u16_p8(poly8x8_t __a) { return (uint16x4_t)__a; } -FUNK uint16x4_t vreinterpret_u16_p16(poly16x4_t __a) { +__funline uint16x4_t vreinterpret_u16_p16(poly16x4_t __a) { return (uint16x4_t)__a; } -FUNK uint16x4_t vreinterpret_u16_p64(poly64x1_t __a) { +__funline uint16x4_t vreinterpret_u16_p64(poly64x1_t __a) { return (uint16x4_t)__a; } -FUNK uint16x8_t vreinterpretq_u16_f64(float64x2_t __a) { +__funline uint16x8_t vreinterpretq_u16_f64(float64x2_t __a) { return (uint16x8_t)__a; } -FUNK uint16x8_t vreinterpretq_u16_s8(int8x16_t __a) { +__funline uint16x8_t vreinterpretq_u16_s8(int8x16_t __a) { return (uint16x8_t)__a; } -FUNK uint16x8_t vreinterpretq_u16_s16(int16x8_t __a) { +__funline uint16x8_t vreinterpretq_u16_s16(int16x8_t __a) { return (uint16x8_t)__a; } -FUNK uint16x8_t vreinterpretq_u16_s32(int32x4_t __a) { +__funline uint16x8_t vreinterpretq_u16_s32(int32x4_t __a) { return (uint16x8_t)__a; } -FUNK uint16x8_t vreinterpretq_u16_s64(int64x2_t __a) { +__funline uint16x8_t vreinterpretq_u16_s64(int64x2_t __a) { return (uint16x8_t)__a; } -FUNK uint16x8_t vreinterpretq_u16_f16(float16x8_t __a) { +__funline uint16x8_t vreinterpretq_u16_f16(float16x8_t __a) { return (uint16x8_t)__a; } -FUNK uint16x8_t vreinterpretq_u16_f32(float32x4_t __a) { +__funline uint16x8_t vreinterpretq_u16_f32(float32x4_t __a) { return (uint16x8_t)__a; } -FUNK uint16x8_t vreinterpretq_u16_u8(uint8x16_t __a) { +__funline uint16x8_t vreinterpretq_u16_u8(uint8x16_t __a) { return (uint16x8_t)__a; } -FUNK uint16x8_t vreinterpretq_u16_u32(uint32x4_t __a) { +__funline uint16x8_t vreinterpretq_u16_u32(uint32x4_t __a) { return (uint16x8_t)__a; } -FUNK uint16x8_t vreinterpretq_u16_u64(uint64x2_t __a) { +__funline uint16x8_t vreinterpretq_u16_u64(uint64x2_t __a) { return (uint16x8_t)__a; } -FUNK uint16x8_t vreinterpretq_u16_p8(poly8x16_t __a) { +__funline uint16x8_t vreinterpretq_u16_p8(poly8x16_t __a) { return (uint16x8_t)__a; } -FUNK uint16x8_t vreinterpretq_u16_p16(poly16x8_t __a) { +__funline uint16x8_t vreinterpretq_u16_p16(poly16x8_t __a) { return (uint16x8_t)__a; } -FUNK uint16x8_t vreinterpretq_u16_p64(poly64x2_t __a) { +__funline uint16x8_t vreinterpretq_u16_p64(poly64x2_t __a) { return (uint16x8_t)__a; } -FUNK uint16x8_t vreinterpretq_u16_p128(poly128_t __a) { +__funline uint16x8_t vreinterpretq_u16_p128(poly128_t __a) { return (uint16x8_t)__a; } -FUNK uint32x2_t vreinterpret_u32_f16(float16x4_t __a) { +__funline uint32x2_t vreinterpret_u32_f16(float16x4_t __a) { return (uint32x2_t)__a; } -FUNK uint32x2_t vreinterpret_u32_f64(float64x1_t __a) { +__funline uint32x2_t vreinterpret_u32_f64(float64x1_t __a) { return (uint32x2_t)__a; } -FUNK uint32x2_t vreinterpret_u32_s8(int8x8_t __a) { +__funline uint32x2_t vreinterpret_u32_s8(int8x8_t __a) { return (uint32x2_t)__a; } -FUNK uint32x2_t vreinterpret_u32_s16(int16x4_t __a) { +__funline uint32x2_t vreinterpret_u32_s16(int16x4_t __a) { return 
(uint32x2_t)__a; } -FUNK uint32x2_t vreinterpret_u32_s32(int32x2_t __a) { +__funline uint32x2_t vreinterpret_u32_s32(int32x2_t __a) { return (uint32x2_t)__a; } -FUNK uint32x2_t vreinterpret_u32_s64(int64x1_t __a) { +__funline uint32x2_t vreinterpret_u32_s64(int64x1_t __a) { return (uint32x2_t)__a; } -FUNK uint32x2_t vreinterpret_u32_f32(float32x2_t __a) { +__funline uint32x2_t vreinterpret_u32_f32(float32x2_t __a) { return (uint32x2_t)__a; } -FUNK uint32x2_t vreinterpret_u32_u8(uint8x8_t __a) { +__funline uint32x2_t vreinterpret_u32_u8(uint8x8_t __a) { return (uint32x2_t)__a; } -FUNK uint32x2_t vreinterpret_u32_u16(uint16x4_t __a) { +__funline uint32x2_t vreinterpret_u32_u16(uint16x4_t __a) { return (uint32x2_t)__a; } -FUNK uint32x2_t vreinterpret_u32_u64(uint64x1_t __a) { +__funline uint32x2_t vreinterpret_u32_u64(uint64x1_t __a) { return (uint32x2_t)__a; } -FUNK uint32x2_t vreinterpret_u32_p8(poly8x8_t __a) { +__funline uint32x2_t vreinterpret_u32_p8(poly8x8_t __a) { return (uint32x2_t)__a; } -FUNK uint32x2_t vreinterpret_u32_p16(poly16x4_t __a) { +__funline uint32x2_t vreinterpret_u32_p16(poly16x4_t __a) { return (uint32x2_t)__a; } -FUNK uint32x2_t vreinterpret_u32_p64(poly64x1_t __a) { +__funline uint32x2_t vreinterpret_u32_p64(poly64x1_t __a) { return (uint32x2_t)__a; } -FUNK uint32x4_t vreinterpretq_u32_f64(float64x2_t __a) { +__funline uint32x4_t vreinterpretq_u32_f64(float64x2_t __a) { return (uint32x4_t)__a; } -FUNK uint32x4_t vreinterpretq_u32_s8(int8x16_t __a) { +__funline uint32x4_t vreinterpretq_u32_s8(int8x16_t __a) { return (uint32x4_t)__a; } -FUNK uint32x4_t vreinterpretq_u32_s16(int16x8_t __a) { +__funline uint32x4_t vreinterpretq_u32_s16(int16x8_t __a) { return (uint32x4_t)__a; } -FUNK uint32x4_t vreinterpretq_u32_s32(int32x4_t __a) { +__funline uint32x4_t vreinterpretq_u32_s32(int32x4_t __a) { return (uint32x4_t)__a; } -FUNK uint32x4_t vreinterpretq_u32_s64(int64x2_t __a) { +__funline uint32x4_t vreinterpretq_u32_s64(int64x2_t __a) { return (uint32x4_t)__a; } -FUNK uint32x4_t vreinterpretq_u32_f16(float16x8_t __a) { +__funline uint32x4_t vreinterpretq_u32_f16(float16x8_t __a) { return (uint32x4_t)__a; } -FUNK uint32x4_t vreinterpretq_u32_f32(float32x4_t __a) { +__funline uint32x4_t vreinterpretq_u32_f32(float32x4_t __a) { return (uint32x4_t)__a; } -FUNK uint32x4_t vreinterpretq_u32_u8(uint8x16_t __a) { +__funline uint32x4_t vreinterpretq_u32_u8(uint8x16_t __a) { return (uint32x4_t)__a; } -FUNK uint32x4_t vreinterpretq_u32_u16(uint16x8_t __a) { +__funline uint32x4_t vreinterpretq_u32_u16(uint16x8_t __a) { return (uint32x4_t)__a; } -FUNK uint32x4_t vreinterpretq_u32_u64(uint64x2_t __a) { +__funline uint32x4_t vreinterpretq_u32_u64(uint64x2_t __a) { return (uint32x4_t)__a; } -FUNK uint32x4_t vreinterpretq_u32_p8(poly8x16_t __a) { +__funline uint32x4_t vreinterpretq_u32_p8(poly8x16_t __a) { return (uint32x4_t)__a; } -FUNK uint32x4_t vreinterpretq_u32_p16(poly16x8_t __a) { +__funline uint32x4_t vreinterpretq_u32_p16(poly16x8_t __a) { return (uint32x4_t)__a; } -FUNK uint32x4_t vreinterpretq_u32_p64(poly64x2_t __a) { +__funline uint32x4_t vreinterpretq_u32_p64(poly64x2_t __a) { return (uint32x4_t)__a; } -FUNK uint32x4_t vreinterpretq_u32_p128(poly128_t __a) { +__funline uint32x4_t vreinterpretq_u32_p128(poly128_t __a) { return (uint32x4_t)__a; } -FUNK float16x4_t vset_lane_f16(float16_t __elem, float16x4_t __vec, - const int __index) { +__funline float16x4_t vset_lane_f16(float16_t __elem, float16x4_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, 
__index); } -FUNK float32x2_t vset_lane_f32(float32_t __elem, float32x2_t __vec, - const int __index) { +__funline float32x2_t vset_lane_f32(float32_t __elem, float32x2_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK float64x1_t vset_lane_f64(float64_t __elem, float64x1_t __vec, - const int __index) { +__funline float64x1_t vset_lane_f64(float64_t __elem, float64x1_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK poly8x8_t vset_lane_p8(poly8_t __elem, poly8x8_t __vec, - const int __index) { +__funline poly8x8_t vset_lane_p8(poly8_t __elem, poly8x8_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK poly16x4_t vset_lane_p16(poly16_t __elem, poly16x4_t __vec, - const int __index) { +__funline poly16x4_t vset_lane_p16(poly16_t __elem, poly16x4_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK poly64x1_t vset_lane_p64(poly64_t __elem, poly64x1_t __vec, - const int __index) { +__funline poly64x1_t vset_lane_p64(poly64_t __elem, poly64x1_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK int8x8_t vset_lane_s8(int8_t __elem, int8x8_t __vec, const int __index) { - return __aarch64_vset_lane_any(__elem, __vec, __index); -} - -FUNK int16x4_t vset_lane_s16(int16_t __elem, int16x4_t __vec, - const int __index) { - return __aarch64_vset_lane_any(__elem, __vec, __index); -} - -FUNK int32x2_t vset_lane_s32(int32_t __elem, int32x2_t __vec, - const int __index) { - return __aarch64_vset_lane_any(__elem, __vec, __index); -} - -FUNK int64x1_t vset_lane_s64(int64_t __elem, int64x1_t __vec, - const int __index) { - return __aarch64_vset_lane_any(__elem, __vec, __index); -} - -FUNK uint8x8_t vset_lane_u8(uint8_t __elem, uint8x8_t __vec, - const int __index) { - return __aarch64_vset_lane_any(__elem, __vec, __index); -} - -FUNK uint16x4_t vset_lane_u16(uint16_t __elem, uint16x4_t __vec, - const int __index) { - return __aarch64_vset_lane_any(__elem, __vec, __index); -} - -FUNK uint32x2_t vset_lane_u32(uint32_t __elem, uint32x2_t __vec, - const int __index) { - return __aarch64_vset_lane_any(__elem, __vec, __index); -} - -FUNK uint64x1_t vset_lane_u64(uint64_t __elem, uint64x1_t __vec, - const int __index) { - return __aarch64_vset_lane_any(__elem, __vec, __index); -} - -FUNK float16x8_t vsetq_lane_f16(float16_t __elem, float16x8_t __vec, +__funline int8x8_t vset_lane_s8(int8_t __elem, int8x8_t __vec, const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK float32x4_t vsetq_lane_f32(float32_t __elem, float32x4_t __vec, - const int __index) { +__funline int16x4_t vset_lane_s16(int16_t __elem, int16x4_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK float64x2_t vsetq_lane_f64(float64_t __elem, float64x2_t __vec, - const int __index) { +__funline int32x2_t vset_lane_s32(int32_t __elem, int32x2_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK poly8x16_t vsetq_lane_p8(poly8_t __elem, poly8x16_t __vec, - const int __index) { +__funline int64x1_t vset_lane_s64(int64_t __elem, int64x1_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK poly16x8_t vsetq_lane_p16(poly16_t __elem, poly16x8_t __vec, - const int __index) { +__funline uint8x8_t vset_lane_u8(uint8_t __elem, uint8x8_t __vec, + const int __index) { return 
__aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK poly64x2_t vsetq_lane_p64(poly64_t __elem, poly64x2_t __vec, - const int __index) { +__funline uint16x4_t vset_lane_u16(uint16_t __elem, uint16x4_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK int8x16_t vsetq_lane_s8(int8_t __elem, int8x16_t __vec, - const int __index) { +__funline uint32x2_t vset_lane_u32(uint32_t __elem, uint32x2_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK int16x8_t vsetq_lane_s16(int16_t __elem, int16x8_t __vec, - const int __index) { +__funline uint64x1_t vset_lane_u64(uint64_t __elem, uint64x1_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK int32x4_t vsetq_lane_s32(int32_t __elem, int32x4_t __vec, - const int __index) { +__funline float16x8_t vsetq_lane_f16(float16_t __elem, float16x8_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK int64x2_t vsetq_lane_s64(int64_t __elem, int64x2_t __vec, - const int __index) { +__funline float32x4_t vsetq_lane_f32(float32_t __elem, float32x4_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK uint8x16_t vsetq_lane_u8(uint8_t __elem, uint8x16_t __vec, - const int __index) { +__funline float64x2_t vsetq_lane_f64(float64_t __elem, float64x2_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK uint16x8_t vsetq_lane_u16(uint16_t __elem, uint16x8_t __vec, - const int __index) { +__funline poly8x16_t vsetq_lane_p8(poly8_t __elem, poly8x16_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK uint32x4_t vsetq_lane_u32(uint32_t __elem, uint32x4_t __vec, - const int __index) { +__funline poly16x8_t vsetq_lane_p16(poly16_t __elem, poly16x8_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } -FUNK uint64x2_t vsetq_lane_u64(uint64_t __elem, uint64x2_t __vec, - const int __index) { +__funline poly64x2_t vsetq_lane_p64(poly64_t __elem, poly64x2_t __vec, + const int __index) { + return __aarch64_vset_lane_any(__elem, __vec, __index); +} + +__funline int8x16_t vsetq_lane_s8(int8_t __elem, int8x16_t __vec, + const int __index) { + return __aarch64_vset_lane_any(__elem, __vec, __index); +} + +__funline int16x8_t vsetq_lane_s16(int16_t __elem, int16x8_t __vec, + const int __index) { + return __aarch64_vset_lane_any(__elem, __vec, __index); +} + +__funline int32x4_t vsetq_lane_s32(int32_t __elem, int32x4_t __vec, + const int __index) { + return __aarch64_vset_lane_any(__elem, __vec, __index); +} + +__funline int64x2_t vsetq_lane_s64(int64_t __elem, int64x2_t __vec, + const int __index) { + return __aarch64_vset_lane_any(__elem, __vec, __index); +} + +__funline uint8x16_t vsetq_lane_u8(uint8_t __elem, uint8x16_t __vec, + const int __index) { + return __aarch64_vset_lane_any(__elem, __vec, __index); +} + +__funline uint16x8_t vsetq_lane_u16(uint16_t __elem, uint16x8_t __vec, + const int __index) { + return __aarch64_vset_lane_any(__elem, __vec, __index); +} + +__funline uint32x4_t vsetq_lane_u32(uint32_t __elem, uint32x4_t __vec, + const int __index) { + return __aarch64_vset_lane_any(__elem, __vec, __index); +} + +__funline uint64x2_t vsetq_lane_u64(uint64_t __elem, uint64x2_t __vec, + const int __index) { return __aarch64_vset_lane_any(__elem, __vec, __index); } @@ -3784,59 +3795,59 @@ FUNK uint64x2_t vsetq_lane_u64(uint64_t __elem, 
uint64x2_t __vec, uint64x1_t lo = vcreate_u64(vgetq_lane_u64(tmp, 0)); \ return vreinterpret_##__TYPE##_u64(lo); -FUNK float16x4_t vget_low_f16(float16x8_t __a) { +__funline float16x4_t vget_low_f16(float16x8_t __a) { __GET_LOW(f16); } -FUNK float32x2_t vget_low_f32(float32x4_t __a) { +__funline float32x2_t vget_low_f32(float32x4_t __a) { __GET_LOW(f32); } -FUNK float64x1_t vget_low_f64(float64x2_t __a) { +__funline float64x1_t vget_low_f64(float64x2_t __a) { return (float64x1_t){vgetq_lane_f64(__a, 0)}; } -FUNK poly8x8_t vget_low_p8(poly8x16_t __a) { +__funline poly8x8_t vget_low_p8(poly8x16_t __a) { __GET_LOW(p8); } -FUNK poly16x4_t vget_low_p16(poly16x8_t __a) { +__funline poly16x4_t vget_low_p16(poly16x8_t __a) { __GET_LOW(p16); } -FUNK poly64x1_t vget_low_p64(poly64x2_t __a) { +__funline poly64x1_t vget_low_p64(poly64x2_t __a) { __GET_LOW(p64); } -FUNK int8x8_t vget_low_s8(int8x16_t __a) { +__funline int8x8_t vget_low_s8(int8x16_t __a) { __GET_LOW(s8); } -FUNK int16x4_t vget_low_s16(int16x8_t __a) { +__funline int16x4_t vget_low_s16(int16x8_t __a) { __GET_LOW(s16); } -FUNK int32x2_t vget_low_s32(int32x4_t __a) { +__funline int32x2_t vget_low_s32(int32x4_t __a) { __GET_LOW(s32); } -FUNK int64x1_t vget_low_s64(int64x2_t __a) { +__funline int64x1_t vget_low_s64(int64x2_t __a) { __GET_LOW(s64); } -FUNK uint8x8_t vget_low_u8(uint8x16_t __a) { +__funline uint8x8_t vget_low_u8(uint8x16_t __a) { __GET_LOW(u8); } -FUNK uint16x4_t vget_low_u16(uint16x8_t __a) { +__funline uint16x4_t vget_low_u16(uint16x8_t __a) { __GET_LOW(u16); } -FUNK uint32x2_t vget_low_u32(uint32x4_t __a) { +__funline uint32x2_t vget_low_u32(uint32x4_t __a) { __GET_LOW(u32); } -FUNK uint64x1_t vget_low_u64(uint64x2_t __a) { +__funline uint64x1_t vget_low_u64(uint64x2_t __a) { return vcreate_u64(vgetq_lane_u64(__a, 0)); } @@ -3847,126 +3858,126 @@ FUNK uint64x1_t vget_low_u64(uint64x2_t __a) { uint64x1_t hi = vcreate_u64(vgetq_lane_u64(tmp, 1)); \ return vreinterpret_##__TYPE##_u64(hi); -FUNK float16x4_t vget_high_f16(float16x8_t __a) { +__funline float16x4_t vget_high_f16(float16x8_t __a) { __GET_HIGH(f16); } -FUNK float32x2_t vget_high_f32(float32x4_t __a) { +__funline float32x2_t vget_high_f32(float32x4_t __a) { __GET_HIGH(f32); } -FUNK float64x1_t vget_high_f64(float64x2_t __a) { +__funline float64x1_t vget_high_f64(float64x2_t __a) { __GET_HIGH(f64); } -FUNK poly8x8_t vget_high_p8(poly8x16_t __a) { +__funline poly8x8_t vget_high_p8(poly8x16_t __a) { __GET_HIGH(p8); } -FUNK poly16x4_t vget_high_p16(poly16x8_t __a) { +__funline poly16x4_t vget_high_p16(poly16x8_t __a) { __GET_HIGH(p16); } -FUNK poly64x1_t vget_high_p64(poly64x2_t __a) { +__funline poly64x1_t vget_high_p64(poly64x2_t __a) { __GET_HIGH(p64); } -FUNK int8x8_t vget_high_s8(int8x16_t __a) { +__funline int8x8_t vget_high_s8(int8x16_t __a) { __GET_HIGH(s8); } -FUNK int16x4_t vget_high_s16(int16x8_t __a) { +__funline int16x4_t vget_high_s16(int16x8_t __a) { __GET_HIGH(s16); } -FUNK int32x2_t vget_high_s32(int32x4_t __a) { +__funline int32x2_t vget_high_s32(int32x4_t __a) { __GET_HIGH(s32); } -FUNK int64x1_t vget_high_s64(int64x2_t __a) { +__funline int64x1_t vget_high_s64(int64x2_t __a) { __GET_HIGH(s64); } -FUNK uint8x8_t vget_high_u8(uint8x16_t __a) { +__funline uint8x8_t vget_high_u8(uint8x16_t __a) { __GET_HIGH(u8); } -FUNK uint16x4_t vget_high_u16(uint16x8_t __a) { +__funline uint16x4_t vget_high_u16(uint16x8_t __a) { __GET_HIGH(u16); } -FUNK uint32x2_t vget_high_u32(uint32x4_t __a) { +__funline uint32x2_t vget_high_u32(uint32x4_t __a) { __GET_HIGH(u32); } 
#undef __GET_HIGH -FUNK uint64x1_t vget_high_u64(uint64x2_t __a) { +__funline uint64x1_t vget_high_u64(uint64x2_t __a) { return vcreate_u64(vgetq_lane_u64(__a, 1)); } -FUNK int8x16_t vcombine_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x16_t vcombine_s8(int8x8_t __a, int8x8_t __b) { return (int8x16_t)__builtin_aarch64_combinev8qi(__a, __b); } -FUNK int16x8_t vcombine_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x8_t vcombine_s16(int16x4_t __a, int16x4_t __b) { return (int16x8_t)__builtin_aarch64_combinev4hi(__a, __b); } -FUNK int32x4_t vcombine_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x4_t vcombine_s32(int32x2_t __a, int32x2_t __b) { return (int32x4_t)__builtin_aarch64_combinev2si(__a, __b); } -FUNK int64x2_t vcombine_s64(int64x1_t __a, int64x1_t __b) { +__funline int64x2_t vcombine_s64(int64x1_t __a, int64x1_t __b) { return __builtin_aarch64_combinedi(__a[0], __b[0]); } -FUNK float16x8_t vcombine_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x8_t vcombine_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_combinev4hf(__a, __b); } -FUNK float32x4_t vcombine_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x4_t vcombine_f32(float32x2_t __a, float32x2_t __b) { return (float32x4_t)__builtin_aarch64_combinev2sf(__a, __b); } -FUNK uint8x16_t vcombine_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x16_t vcombine_u8(uint8x8_t __a, uint8x8_t __b) { return (uint8x16_t)__builtin_aarch64_combinev8qi((int8x8_t)__a, (int8x8_t)__b); } -FUNK uint16x8_t vcombine_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x8_t vcombine_u16(uint16x4_t __a, uint16x4_t __b) { return (uint16x8_t)__builtin_aarch64_combinev4hi((int16x4_t)__a, (int16x4_t)__b); } -FUNK uint32x4_t vcombine_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x4_t vcombine_u32(uint32x2_t __a, uint32x2_t __b) { return (uint32x4_t)__builtin_aarch64_combinev2si((int32x2_t)__a, (int32x2_t)__b); } -FUNK uint64x2_t vcombine_u64(uint64x1_t __a, uint64x1_t __b) { +__funline uint64x2_t vcombine_u64(uint64x1_t __a, uint64x1_t __b) { return (uint64x2_t)__builtin_aarch64_combinedi(__a[0], __b[0]); } -FUNK float64x2_t vcombine_f64(float64x1_t __a, float64x1_t __b) { +__funline float64x2_t vcombine_f64(float64x1_t __a, float64x1_t __b) { return __builtin_aarch64_combinedf(__a[0], __b[0]); } -FUNK poly8x16_t vcombine_p8(poly8x8_t __a, poly8x8_t __b) { +__funline poly8x16_t vcombine_p8(poly8x8_t __a, poly8x8_t __b) { return (poly8x16_t)__builtin_aarch64_combinev8qi((int8x8_t)__a, (int8x8_t)__b); } -FUNK poly16x8_t vcombine_p16(poly16x4_t __a, poly16x4_t __b) { +__funline poly16x8_t vcombine_p16(poly16x4_t __a, poly16x4_t __b) { return (poly16x8_t)__builtin_aarch64_combinev4hi((int16x4_t)__a, (int16x4_t)__b); } -FUNK poly64x2_t vcombine_p64(poly64x1_t __a, poly64x1_t __b) { +__funline poly64x2_t vcombine_p64(poly64x1_t __a, poly64x1_t __b) { return (poly64x2_t)__builtin_aarch64_combinedi_ppp(__a[0], __b[0]); } -FUNK int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) { +__funline int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) { int8x8_t result; __asm__("saba %0.8b,%2.8b,%3.8b" : "=w"(result) @@ -3975,7 +3986,7 @@ FUNK int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return result; } -FUNK int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) { +__funline int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) { int16x4_t result; __asm__("saba %0.4h,%2.4h,%3.4h" : "=w"(result) @@ -3984,7 +3995,7 @@ FUNK int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return 
result; } -FUNK int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) { +__funline int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) { int32x2_t result; __asm__("saba %0.2s,%2.2s,%3.2s" : "=w"(result) @@ -3993,7 +4004,7 @@ FUNK int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return result; } -FUNK uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { +__funline uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { uint8x8_t result; __asm__("uaba %0.8b,%2.8b,%3.8b" : "=w"(result) @@ -4002,7 +4013,7 @@ FUNK uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return result; } -FUNK uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { +__funline uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { uint16x4_t result; __asm__("uaba %0.4h,%2.4h,%3.4h" : "=w"(result) @@ -4011,7 +4022,7 @@ FUNK uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return result; } -FUNK uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { +__funline uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { uint32x2_t result; __asm__("uaba %0.2s,%2.2s,%3.2s" : "=w"(result) @@ -4020,7 +4031,7 @@ FUNK uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return result; } -FUNK int16x8_t vabal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { +__funline int16x8_t vabal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { int16x8_t result; __asm__("sabal2 %0.8h,%2.16b,%3.16b" : "=w"(result) @@ -4029,7 +4040,7 @@ FUNK int16x8_t vabal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { return result; } -FUNK int32x4_t vabal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { +__funline int32x4_t vabal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { int32x4_t result; __asm__("sabal2 %0.4s,%2.8h,%3.8h" : "=w"(result) @@ -4038,7 +4049,7 @@ FUNK int32x4_t vabal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { return result; } -FUNK int64x2_t vabal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { +__funline int64x2_t vabal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { int64x2_t result; __asm__("sabal2 %0.2d,%2.4s,%3.4s" : "=w"(result) @@ -4047,7 +4058,7 @@ FUNK int64x2_t vabal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { return result; } -FUNK uint16x8_t vabal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { +__funline uint16x8_t vabal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { uint16x8_t result; __asm__("uabal2 %0.8h,%2.16b,%3.16b" : "=w"(result) @@ -4056,7 +4067,7 @@ FUNK uint16x8_t vabal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { return result; } -FUNK uint32x4_t vabal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { +__funline uint32x4_t vabal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { uint32x4_t result; __asm__("uabal2 %0.4s,%2.8h,%3.8h" : "=w"(result) @@ -4065,7 +4076,7 @@ FUNK uint32x4_t vabal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { return result; } -FUNK uint64x2_t vabal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { +__funline uint64x2_t vabal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { uint64x2_t result; __asm__("uabal2 %0.2d,%2.4s,%3.4s" : "=w"(result) @@ -4074,7 +4085,7 @@ FUNK uint64x2_t vabal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { return result; } -FUNK int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { +__funline int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { int16x8_t result; __asm__("sabal %0.8h,%2.8b,%3.8b" : "=w"(result) @@ -4083,7 +4094,7 @@ FUNK int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return 
result; } -FUNK int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { +__funline int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { int32x4_t result; __asm__("sabal %0.4s,%2.4h,%3.4h" : "=w"(result) @@ -4092,7 +4103,7 @@ FUNK int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return result; } -FUNK int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { +__funline int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { int64x2_t result; __asm__("sabal %0.2d,%2.2s,%3.2s" : "=w"(result) @@ -4101,7 +4112,7 @@ FUNK int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return result; } -FUNK uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { +__funline uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { uint16x8_t result; __asm__("uabal %0.8h,%2.8b,%3.8b" : "=w"(result) @@ -4110,7 +4121,7 @@ FUNK uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return result; } -FUNK uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { +__funline uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { uint32x4_t result; __asm__("uabal %0.4s,%2.4h,%3.4h" : "=w"(result) @@ -4119,7 +4130,7 @@ FUNK uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return result; } -FUNK uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { +__funline uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { uint64x2_t result; __asm__("uabal %0.2d,%2.2s,%3.2s" : "=w"(result) @@ -4128,7 +4139,7 @@ FUNK uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return result; } -FUNK int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { +__funline int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { int8x16_t result; __asm__("saba %0.16b,%2.16b,%3.16b" : "=w"(result) @@ -4137,7 +4148,7 @@ FUNK int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return result; } -FUNK int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { +__funline int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { int16x8_t result; __asm__("saba %0.8h,%2.8h,%3.8h" : "=w"(result) @@ -4146,7 +4157,7 @@ FUNK int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return result; } -FUNK int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { +__funline int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { int32x4_t result; __asm__("saba %0.4s,%2.4s,%3.4s" : "=w"(result) @@ -4155,7 +4166,7 @@ FUNK int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return result; } -FUNK uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { +__funline uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { uint8x16_t result; __asm__("uaba %0.16b,%2.16b,%3.16b" : "=w"(result) @@ -4164,7 +4175,7 @@ FUNK uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return result; } -FUNK uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { +__funline uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { uint16x8_t result; __asm__("uaba %0.8h,%2.8h,%3.8h" : "=w"(result) @@ -4173,7 +4184,7 @@ FUNK uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return result; } -FUNK uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { +__funline uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { uint32x4_t result; __asm__("uaba %0.4s,%2.4s,%3.4s" : "=w"(result) @@ -4182,7 +4193,7 @@ FUNK uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return result; } -FUNK int8x8_t vabd_s8(int8x8_t a, int8x8_t b) { +__funline 
int8x8_t vabd_s8(int8x8_t a, int8x8_t b) { int8x8_t result; __asm__("sabd %0.8b, %1.8b, %2.8b" : "=w"(result) @@ -4191,7 +4202,7 @@ FUNK int8x8_t vabd_s8(int8x8_t a, int8x8_t b) { return result; } -FUNK int16x4_t vabd_s16(int16x4_t a, int16x4_t b) { +__funline int16x4_t vabd_s16(int16x4_t a, int16x4_t b) { int16x4_t result; __asm__("sabd %0.4h, %1.4h, %2.4h" : "=w"(result) @@ -4200,7 +4211,7 @@ FUNK int16x4_t vabd_s16(int16x4_t a, int16x4_t b) { return result; } -FUNK int32x2_t vabd_s32(int32x2_t a, int32x2_t b) { +__funline int32x2_t vabd_s32(int32x2_t a, int32x2_t b) { int32x2_t result; __asm__("sabd %0.2s, %1.2s, %2.2s" : "=w"(result) @@ -4209,7 +4220,7 @@ FUNK int32x2_t vabd_s32(int32x2_t a, int32x2_t b) { return result; } -FUNK uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b) { +__funline uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b) { uint8x8_t result; __asm__("uabd %0.8b, %1.8b, %2.8b" : "=w"(result) @@ -4218,7 +4229,7 @@ FUNK uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b) { return result; } -FUNK uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b) { +__funline uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b) { uint16x4_t result; __asm__("uabd %0.4h, %1.4h, %2.4h" : "=w"(result) @@ -4227,7 +4238,7 @@ FUNK uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b) { return result; } -FUNK uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b) { +__funline uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b) { uint32x2_t result; __asm__("uabd %0.2s, %1.2s, %2.2s" : "=w"(result) @@ -4236,7 +4247,7 @@ FUNK uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b) { return result; } -FUNK int16x8_t vabdl_high_s8(int8x16_t a, int8x16_t b) { +__funline int16x8_t vabdl_high_s8(int8x16_t a, int8x16_t b) { int16x8_t result; __asm__("sabdl2 %0.8h,%1.16b,%2.16b" : "=w"(result) @@ -4245,7 +4256,7 @@ FUNK int16x8_t vabdl_high_s8(int8x16_t a, int8x16_t b) { return result; } -FUNK int32x4_t vabdl_high_s16(int16x8_t a, int16x8_t b) { +__funline int32x4_t vabdl_high_s16(int16x8_t a, int16x8_t b) { int32x4_t result; __asm__("sabdl2 %0.4s,%1.8h,%2.8h" : "=w"(result) @@ -4254,7 +4265,7 @@ FUNK int32x4_t vabdl_high_s16(int16x8_t a, int16x8_t b) { return result; } -FUNK int64x2_t vabdl_high_s32(int32x4_t a, int32x4_t b) { +__funline int64x2_t vabdl_high_s32(int32x4_t a, int32x4_t b) { int64x2_t result; __asm__("sabdl2 %0.2d,%1.4s,%2.4s" : "=w"(result) @@ -4263,7 +4274,7 @@ FUNK int64x2_t vabdl_high_s32(int32x4_t a, int32x4_t b) { return result; } -FUNK uint16x8_t vabdl_high_u8(uint8x16_t a, uint8x16_t b) { +__funline uint16x8_t vabdl_high_u8(uint8x16_t a, uint8x16_t b) { uint16x8_t result; __asm__("uabdl2 %0.8h,%1.16b,%2.16b" : "=w"(result) @@ -4272,7 +4283,7 @@ FUNK uint16x8_t vabdl_high_u8(uint8x16_t a, uint8x16_t b) { return result; } -FUNK uint32x4_t vabdl_high_u16(uint16x8_t a, uint16x8_t b) { +__funline uint32x4_t vabdl_high_u16(uint16x8_t a, uint16x8_t b) { uint32x4_t result; __asm__("uabdl2 %0.4s,%1.8h,%2.8h" : "=w"(result) @@ -4281,7 +4292,7 @@ FUNK uint32x4_t vabdl_high_u16(uint16x8_t a, uint16x8_t b) { return result; } -FUNK uint64x2_t vabdl_high_u32(uint32x4_t a, uint32x4_t b) { +__funline uint64x2_t vabdl_high_u32(uint32x4_t a, uint32x4_t b) { uint64x2_t result; __asm__("uabdl2 %0.2d,%1.4s,%2.4s" : "=w"(result) @@ -4290,7 +4301,7 @@ FUNK uint64x2_t vabdl_high_u32(uint32x4_t a, uint32x4_t b) { return result; } -FUNK int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) { +__funline int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) { int16x8_t result; __asm__("sabdl %0.8h, %1.8b, %2.8b" : "=w"(result) @@ -4299,7 +4310,7 @@ FUNK int16x8_t vabdl_s8(int8x8_t a, 
int8x8_t b) { return result; } -FUNK int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) { +__funline int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) { int32x4_t result; __asm__("sabdl %0.4s, %1.4h, %2.4h" : "=w"(result) @@ -4308,7 +4319,7 @@ FUNK int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) { return result; } -FUNK int64x2_t vabdl_s32(int32x2_t a, int32x2_t b) { +__funline int64x2_t vabdl_s32(int32x2_t a, int32x2_t b) { int64x2_t result; __asm__("sabdl %0.2d, %1.2s, %2.2s" : "=w"(result) @@ -4317,7 +4328,7 @@ FUNK int64x2_t vabdl_s32(int32x2_t a, int32x2_t b) { return result; } -FUNK uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b) { +__funline uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b) { uint16x8_t result; __asm__("uabdl %0.8h, %1.8b, %2.8b" : "=w"(result) @@ -4326,7 +4337,7 @@ FUNK uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b) { return result; } -FUNK uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b) { +__funline uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b) { uint32x4_t result; __asm__("uabdl %0.4s, %1.4h, %2.4h" : "=w"(result) @@ -4335,7 +4346,7 @@ FUNK uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b) { return result; } -FUNK uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b) { +__funline uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b) { uint64x2_t result; __asm__("uabdl %0.2d, %1.2s, %2.2s" : "=w"(result) @@ -4344,7 +4355,7 @@ FUNK uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b) { return result; } -FUNK int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) { +__funline int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) { int8x16_t result; __asm__("sabd %0.16b, %1.16b, %2.16b" : "=w"(result) @@ -4353,7 +4364,7 @@ FUNK int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) { return result; } -FUNK int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) { +__funline int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) { int16x8_t result; __asm__("sabd %0.8h, %1.8h, %2.8h" : "=w"(result) @@ -4362,7 +4373,7 @@ FUNK int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) { return result; } -FUNK int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) { +__funline int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) { int32x4_t result; __asm__("sabd %0.4s, %1.4s, %2.4s" : "=w"(result) @@ -4371,7 +4382,7 @@ FUNK int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) { return result; } -FUNK uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) { +__funline uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) { uint8x16_t result; __asm__("uabd %0.16b, %1.16b, %2.16b" : "=w"(result) @@ -4380,7 +4391,7 @@ FUNK uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) { return result; } -FUNK uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b) { +__funline uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b) { uint16x8_t result; __asm__("uabd %0.8h, %1.8h, %2.8h" : "=w"(result) @@ -4389,7 +4400,7 @@ FUNK uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b) { return result; } -FUNK uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b) { +__funline uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b) { uint32x4_t result; __asm__("uabd %0.4s, %1.4s, %2.4s" : "=w"(result) @@ -4398,73 +4409,73 @@ FUNK uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b) { return result; } -FUNK int16_t vaddlv_s8(int8x8_t a) { +__funline int16_t vaddlv_s8(int8x8_t a) { int16_t result; __asm__("saddlv %h0,%1.8b" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int32_t vaddlv_s16(int16x4_t a) { +__funline int32_t vaddlv_s16(int16x4_t a) { int32_t result; __asm__("saddlv %s0,%1.4h" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint16_t vaddlv_u8(uint8x8_t a) { +__funline uint16_t vaddlv_u8(uint8x8_t a) { 
uint16_t result; __asm__("uaddlv %h0,%1.8b" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint32_t vaddlv_u16(uint16x4_t a) { +__funline uint32_t vaddlv_u16(uint16x4_t a) { uint32_t result; __asm__("uaddlv %s0,%1.4h" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int16_t vaddlvq_s8(int8x16_t a) { +__funline int16_t vaddlvq_s8(int8x16_t a) { int16_t result; __asm__("saddlv %h0,%1.16b" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int32_t vaddlvq_s16(int16x8_t a) { +__funline int32_t vaddlvq_s16(int16x8_t a) { int32_t result; __asm__("saddlv %s0,%1.8h" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int64_t vaddlvq_s32(int32x4_t a) { +__funline int64_t vaddlvq_s32(int32x4_t a) { int64_t result; __asm__("saddlv %d0,%1.4s" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint16_t vaddlvq_u8(uint8x16_t a) { +__funline uint16_t vaddlvq_u8(uint8x16_t a) { uint16_t result; __asm__("uaddlv %h0,%1.16b" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint32_t vaddlvq_u16(uint16x8_t a) { +__funline uint32_t vaddlvq_u16(uint16x8_t a) { uint32_t result; __asm__("uaddlv %s0,%1.8h" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint64_t vaddlvq_u32(uint32x4_t a) { +__funline uint64_t vaddlvq_u32(uint32x4_t a) { uint64_t result; __asm__("uaddlv %d0,%1.4s" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK float32x2_t vcvtx_f32_f64(float64x2_t a) { +__funline float32x2_t vcvtx_f32_f64(float64x2_t a) { float32x2_t result; __asm__("fcvtxn %0.2s,%1.2d" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK float32x4_t vcvtx_high_f32_f64(float32x2_t a, float64x2_t b) { +__funline float32x4_t vcvtx_high_f32_f64(float32x2_t a, float64x2_t b) { float32x4_t result; __asm__("fcvtxn2 %0.4s,%1.2d" : "=w"(result) @@ -4473,13 +4484,13 @@ FUNK float32x4_t vcvtx_high_f32_f64(float32x2_t a, float64x2_t b) { return result; } -FUNK float32_t vcvtxd_f32_f64(float64_t a) { +__funline float32_t vcvtxd_f32_f64(float64_t a) { float32_t result; __asm__("fcvtxn %s0,%d1" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) { +__funline float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) { float32x2_t result; float32x2_t t1; __asm__("fmul %1.2s, %3.2s, %4.s[0]; fadd %0.2s, %0.2s, %1.2s" @@ -4489,7 +4500,7 @@ FUNK float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) { return result; } -FUNK int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) { +__funline int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) { int16x4_t result; __asm__("mla %0.4h,%2.4h,%3.h[0]" : "=w"(result) @@ -4498,7 +4509,7 @@ FUNK int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) { return result; } -FUNK int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) { +__funline int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) { int32x2_t result; __asm__("mla %0.2s,%2.2s,%3.s[0]" : "=w"(result) @@ -4507,7 +4518,7 @@ FUNK int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) { return result; } -FUNK uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { +__funline uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { uint16x4_t result; __asm__("mla %0.4h,%2.4h,%3.h[0]" : "=w"(result) @@ -4516,7 +4527,7 @@ FUNK uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { return result; } -FUNK uint32x2_t vmla_n_u32(uint32x2_t a, 
uint32x2_t b, uint32_t c) { +__funline uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { uint32x2_t result; __asm__("mla %0.2s,%2.2s,%3.s[0]" : "=w"(result) @@ -4525,7 +4536,7 @@ FUNK uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { return result; } -FUNK int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) { +__funline int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) { int8x8_t result; __asm__("mla %0.8b, %2.8b, %3.8b" : "=w"(result) @@ -4534,7 +4545,7 @@ FUNK int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return result; } -FUNK int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) { +__funline int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) { int16x4_t result; __asm__("mla %0.4h, %2.4h, %3.4h" : "=w"(result) @@ -4543,7 +4554,7 @@ FUNK int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return result; } -FUNK int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) { +__funline int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) { int32x2_t result; __asm__("mla %0.2s, %2.2s, %3.2s" : "=w"(result) @@ -4552,7 +4563,7 @@ FUNK int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return result; } -FUNK uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { +__funline uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { uint8x8_t result; __asm__("mla %0.8b, %2.8b, %3.8b" : "=w"(result) @@ -4561,7 +4572,7 @@ FUNK uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return result; } -FUNK uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { +__funline uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { uint16x4_t result; __asm__("mla %0.4h, %2.4h, %3.4h" : "=w"(result) @@ -4570,7 +4581,7 @@ FUNK uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return result; } -FUNK uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { +__funline uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { uint32x2_t result; __asm__("mla %0.2s, %2.2s, %3.2s" : "=w"(result) @@ -4683,7 +4694,7 @@ FUNK uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { result; \ }) -FUNK int32x4_t vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { +__funline int32x4_t vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { int32x4_t result; __asm__("smlal2 %0.4s,%2.8h,%3.h[0]" : "=w"(result) @@ -4692,7 +4703,7 @@ FUNK int32x4_t vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { return result; } -FUNK int64x2_t vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { +__funline int64x2_t vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { int64x2_t result; __asm__("smlal2 %0.2d,%2.4s,%3.s[0]" : "=w"(result) @@ -4701,7 +4712,7 @@ FUNK int64x2_t vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { return result; } -FUNK uint32x4_t vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { +__funline uint32x4_t vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { uint32x4_t result; __asm__("umlal2 %0.4s,%2.8h,%3.h[0]" : "=w"(result) @@ -4710,7 +4721,7 @@ FUNK uint32x4_t vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { return result; } -FUNK uint64x2_t vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { +__funline uint64x2_t vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { uint64x2_t result; __asm__("umlal2 %0.2d,%2.4s,%3.s[0]" : "=w"(result) @@ -4719,7 +4730,7 @@ FUNK uint64x2_t vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { return result; } -FUNK int16x8_t vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { +__funline 
int16x8_t vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { int16x8_t result; __asm__("smlal2 %0.8h,%2.16b,%3.16b" : "=w"(result) @@ -4728,7 +4739,7 @@ FUNK int16x8_t vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { return result; } -FUNK int32x4_t vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { +__funline int32x4_t vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { int32x4_t result; __asm__("smlal2 %0.4s,%2.8h,%3.8h" : "=w"(result) @@ -4737,7 +4748,7 @@ FUNK int32x4_t vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { return result; } -FUNK int64x2_t vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { +__funline int64x2_t vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { int64x2_t result; __asm__("smlal2 %0.2d,%2.4s,%3.4s" : "=w"(result) @@ -4746,7 +4757,7 @@ FUNK int64x2_t vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { return result; } -FUNK uint16x8_t vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { +__funline uint16x8_t vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { uint16x8_t result; __asm__("umlal2 %0.8h,%2.16b,%3.16b" : "=w"(result) @@ -4755,7 +4766,7 @@ FUNK uint16x8_t vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { return result; } -FUNK uint32x4_t vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { +__funline uint32x4_t vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { uint32x4_t result; __asm__("umlal2 %0.4s,%2.8h,%3.8h" : "=w"(result) @@ -4764,7 +4775,7 @@ FUNK uint32x4_t vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { return result; } -FUNK uint64x2_t vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { +__funline uint64x2_t vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { uint64x2_t result; __asm__("umlal2 %0.2d,%2.4s,%3.4s" : "=w"(result) @@ -4877,7 +4888,7 @@ FUNK uint64x2_t vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { result; \ }) -FUNK int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { +__funline int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { int32x4_t result; __asm__("smlal %0.4s,%2.4h,%3.h[0]" : "=w"(result) @@ -4886,7 +4897,7 @@ FUNK int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return result; } -FUNK int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { +__funline int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { int64x2_t result; __asm__("smlal %0.2d,%2.2s,%3.s[0]" : "=w"(result) @@ -4895,7 +4906,7 @@ FUNK int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return result; } -FUNK uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { +__funline uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { uint32x4_t result; __asm__("umlal %0.4s,%2.4h,%3.h[0]" : "=w"(result) @@ -4904,7 +4915,7 @@ FUNK uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { return result; } -FUNK uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { +__funline uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { uint64x2_t result; __asm__("umlal %0.2d,%2.2s,%3.s[0]" : "=w"(result) @@ -4913,7 +4924,7 @@ FUNK uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { return result; } -FUNK int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { +__funline int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { int16x8_t result; __asm__("smlal %0.8h,%2.8b,%3.8b" : "=w"(result) @@ -4922,7 +4933,7 @@ FUNK int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return result; } -FUNK int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t 
c) { +__funline int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { int32x4_t result; __asm__("smlal %0.4s,%2.4h,%3.4h" : "=w"(result) @@ -4931,7 +4942,7 @@ FUNK int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return result; } -FUNK int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { +__funline int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { int64x2_t result; __asm__("smlal %0.2d,%2.2s,%3.2s" : "=w"(result) @@ -4940,7 +4951,7 @@ FUNK int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return result; } -FUNK uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { +__funline uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { uint16x8_t result; __asm__("umlal %0.8h,%2.8b,%3.8b" : "=w"(result) @@ -4949,7 +4960,7 @@ FUNK uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return result; } -FUNK uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { +__funline uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { uint32x4_t result; __asm__("umlal %0.4s,%2.4h,%3.4h" : "=w"(result) @@ -4958,7 +4969,7 @@ FUNK uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return result; } -FUNK uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { +__funline uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { uint64x2_t result; __asm__("umlal %0.2d,%2.2s,%3.2s" : "=w"(result) @@ -4967,7 +4978,7 @@ FUNK uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return result; } -FUNK float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) { +__funline float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) { float32x4_t result; float32x4_t t1; __asm__("fmul %1.4s, %3.4s, %4.s[0]; fadd %0.4s, %0.4s, %1.4s" @@ -4977,7 +4988,7 @@ FUNK float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) { return result; } -FUNK int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { +__funline int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { int16x8_t result; __asm__("mla %0.8h,%2.8h,%3.h[0]" : "=w"(result) @@ -4986,7 +4997,7 @@ FUNK int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { return result; } -FUNK int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { +__funline int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { int32x4_t result; __asm__("mla %0.4s,%2.4s,%3.s[0]" : "=w"(result) @@ -4995,7 +5006,7 @@ FUNK int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { return result; } -FUNK uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { +__funline uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { uint16x8_t result; __asm__("mla %0.8h,%2.8h,%3.h[0]" : "=w"(result) @@ -5004,7 +5015,7 @@ FUNK uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { return result; } -FUNK uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { +__funline uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { uint32x4_t result; __asm__("mla %0.4s,%2.4s,%3.s[0]" : "=w"(result) @@ -5013,7 +5024,7 @@ FUNK uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { return result; } -FUNK int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { +__funline int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { int8x16_t result; __asm__("mla %0.16b, %2.16b, %3.16b" : "=w"(result) @@ -5022,7 +5033,7 @@ FUNK int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return result; } -FUNK int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { +__funline 
int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { int16x8_t result; __asm__("mla %0.8h, %2.8h, %3.8h" : "=w"(result) @@ -5031,7 +5042,7 @@ FUNK int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return result; } -FUNK int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { +__funline int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { int32x4_t result; __asm__("mla %0.4s, %2.4s, %3.4s" : "=w"(result) @@ -5040,7 +5051,7 @@ FUNK int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return result; } -FUNK uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { +__funline uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { uint8x16_t result; __asm__("mla %0.16b, %2.16b, %3.16b" : "=w"(result) @@ -5049,7 +5060,7 @@ FUNK uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return result; } -FUNK uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { +__funline uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { uint16x8_t result; __asm__("mla %0.8h, %2.8h, %3.8h" : "=w"(result) @@ -5058,7 +5069,7 @@ FUNK uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return result; } -FUNK uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { +__funline uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { uint32x4_t result; __asm__("mla %0.4s, %2.4s, %3.4s" : "=w"(result) @@ -5067,7 +5078,7 @@ FUNK uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return result; } -FUNK float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) { +__funline float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) { float32x2_t result; float32x2_t t1; __asm__("fmul %1.2s, %3.2s, %4.s[0]; fsub %0.2s, %0.2s, %1.2s" @@ -5077,7 +5088,7 @@ FUNK float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) { return result; } -FUNK int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) { +__funline int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) { int16x4_t result; __asm__("mls %0.4h, %2.4h, %3.h[0]" : "=w"(result) @@ -5086,7 +5097,7 @@ FUNK int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) { return result; } -FUNK int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) { +__funline int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) { int32x2_t result; __asm__("mls %0.2s, %2.2s, %3.s[0]" : "=w"(result) @@ -5095,7 +5106,7 @@ FUNK int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) { return result; } -FUNK uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { +__funline uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { uint16x4_t result; __asm__("mls %0.4h, %2.4h, %3.h[0]" : "=w"(result) @@ -5104,7 +5115,7 @@ FUNK uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { return result; } -FUNK uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { +__funline uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { uint32x2_t result; __asm__("mls %0.2s, %2.2s, %3.s[0]" : "=w"(result) @@ -5113,7 +5124,7 @@ FUNK uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { return result; } -FUNK int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) { +__funline int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) { int8x8_t result; __asm__("mls %0.8b,%2.8b,%3.8b" : "=w"(result) @@ -5122,7 +5133,7 @@ FUNK int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return result; } -FUNK int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) { +__funline int16x4_t vmls_s16(int16x4_t a, int16x4_t 
b, int16x4_t c) { int16x4_t result; __asm__("mls %0.4h,%2.4h,%3.4h" : "=w"(result) @@ -5131,7 +5142,7 @@ FUNK int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return result; } -FUNK int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) { +__funline int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) { int32x2_t result; __asm__("mls %0.2s,%2.2s,%3.2s" : "=w"(result) @@ -5140,7 +5151,7 @@ FUNK int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return result; } -FUNK uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { +__funline uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { uint8x8_t result; __asm__("mls %0.8b,%2.8b,%3.8b" : "=w"(result) @@ -5149,7 +5160,7 @@ FUNK uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return result; } -FUNK uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { +__funline uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { uint16x4_t result; __asm__("mls %0.4h,%2.4h,%3.4h" : "=w"(result) @@ -5158,7 +5169,7 @@ FUNK uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return result; } -FUNK uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { +__funline uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { uint32x2_t result; __asm__("mls %0.2s,%2.2s,%3.2s" : "=w"(result) @@ -5271,7 +5282,7 @@ FUNK uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { result; \ }) -FUNK int32x4_t vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { +__funline int32x4_t vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { int32x4_t result; __asm__("smlsl2 %0.4s, %2.8h, %3.h[0]" : "=w"(result) @@ -5280,7 +5291,7 @@ FUNK int32x4_t vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { return result; } -FUNK int64x2_t vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { +__funline int64x2_t vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { int64x2_t result; __asm__("smlsl2 %0.2d, %2.4s, %3.s[0]" : "=w"(result) @@ -5289,7 +5300,7 @@ FUNK int64x2_t vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { return result; } -FUNK uint32x4_t vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { +__funline uint32x4_t vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { uint32x4_t result; __asm__("umlsl2 %0.4s, %2.8h, %3.h[0]" : "=w"(result) @@ -5298,7 +5309,7 @@ FUNK uint32x4_t vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { return result; } -FUNK uint64x2_t vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { +__funline uint64x2_t vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { uint64x2_t result; __asm__("umlsl2 %0.2d, %2.4s, %3.s[0]" : "=w"(result) @@ -5307,7 +5318,7 @@ FUNK uint64x2_t vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { return result; } -FUNK int16x8_t vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { +__funline int16x8_t vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { int16x8_t result; __asm__("smlsl2 %0.8h,%2.16b,%3.16b" : "=w"(result) @@ -5316,7 +5327,7 @@ FUNK int16x8_t vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { return result; } -FUNK int32x4_t vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { +__funline int32x4_t vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { int32x4_t result; __asm__("smlsl2 %0.4s,%2.8h,%3.8h" : "=w"(result) @@ -5325,7 +5336,7 @@ FUNK int32x4_t vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { return result; } -FUNK int64x2_t vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { +__funline int64x2_t vmlsl_high_s32(int64x2_t a, 
int32x4_t b, int32x4_t c) { int64x2_t result; __asm__("smlsl2 %0.2d,%2.4s,%3.4s" : "=w"(result) @@ -5334,7 +5345,7 @@ FUNK int64x2_t vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { return result; } -FUNK uint16x8_t vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { +__funline uint16x8_t vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { uint16x8_t result; __asm__("umlsl2 %0.8h,%2.16b,%3.16b" : "=w"(result) @@ -5343,7 +5354,7 @@ FUNK uint16x8_t vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { return result; } -FUNK uint32x4_t vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { +__funline uint32x4_t vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { uint32x4_t result; __asm__("umlsl2 %0.4s,%2.8h,%3.8h" : "=w"(result) @@ -5352,7 +5363,7 @@ FUNK uint32x4_t vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { return result; } -FUNK uint64x2_t vmlsl_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { +__funline uint64x2_t vmlsl_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { uint64x2_t result; __asm__("umlsl2 %0.2d,%2.4s,%3.4s" : "=w"(result) @@ -5465,7 +5476,7 @@ FUNK uint64x2_t vmlsl_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { result; \ }) -FUNK int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { +__funline int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { int32x4_t result; __asm__("smlsl %0.4s, %2.4h, %3.h[0]" : "=w"(result) @@ -5474,7 +5485,7 @@ FUNK int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return result; } -FUNK int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { +__funline int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { int64x2_t result; __asm__("smlsl %0.2d, %2.2s, %3.s[0]" : "=w"(result) @@ -5483,7 +5494,7 @@ FUNK int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return result; } -FUNK uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { +__funline uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { uint32x4_t result; __asm__("umlsl %0.4s, %2.4h, %3.h[0]" : "=w"(result) @@ -5492,7 +5503,7 @@ FUNK uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { return result; } -FUNK uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { +__funline uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { uint64x2_t result; __asm__("umlsl %0.2d, %2.2s, %3.s[0]" : "=w"(result) @@ -5501,7 +5512,7 @@ FUNK uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { return result; } -FUNK int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) { +__funline int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) { int16x8_t result; __asm__("smlsl %0.8h, %2.8b, %3.8b" : "=w"(result) @@ -5510,7 +5521,7 @@ FUNK int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return result; } -FUNK int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { +__funline int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { int32x4_t result; __asm__("smlsl %0.4s, %2.4h, %3.4h" : "=w"(result) @@ -5519,7 +5530,7 @@ FUNK int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return result; } -FUNK int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { +__funline int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { int64x2_t result; __asm__("smlsl %0.2d, %2.2s, %3.2s" : "=w"(result) @@ -5528,7 +5539,7 @@ FUNK int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return result; } -FUNK uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { +__funline uint16x8_t vmlsl_u8(uint16x8_t a, 
uint8x8_t b, uint8x8_t c) { uint16x8_t result; __asm__("umlsl %0.8h, %2.8b, %3.8b" : "=w"(result) @@ -5537,7 +5548,7 @@ FUNK uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return result; } -FUNK uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { +__funline uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { uint32x4_t result; __asm__("umlsl %0.4s, %2.4h, %3.4h" : "=w"(result) @@ -5546,7 +5557,7 @@ FUNK uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return result; } -FUNK uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { +__funline uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { uint64x2_t result; __asm__("umlsl %0.2d, %2.2s, %3.2s" : "=w"(result) @@ -5555,7 +5566,7 @@ FUNK uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return result; } -FUNK float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) { +__funline float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) { float32x4_t result; float32x4_t t1; __asm__("fmul %1.4s, %3.4s, %4.s[0]; fsub %0.4s, %0.4s, %1.4s" @@ -5565,7 +5576,7 @@ FUNK float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) { return result; } -FUNK int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { +__funline int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { int16x8_t result; __asm__("mls %0.8h, %2.8h, %3.h[0]" : "=w"(result) @@ -5574,7 +5585,7 @@ FUNK int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { return result; } -FUNK int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { +__funline int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { int32x4_t result; __asm__("mls %0.4s, %2.4s, %3.s[0]" : "=w"(result) @@ -5583,7 +5594,7 @@ FUNK int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { return result; } -FUNK uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { +__funline uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { uint16x8_t result; __asm__("mls %0.8h, %2.8h, %3.h[0]" : "=w"(result) @@ -5592,7 +5603,7 @@ FUNK uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { return result; } -FUNK uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { +__funline uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { uint32x4_t result; __asm__("mls %0.4s, %2.4s, %3.s[0]" : "=w"(result) @@ -5601,7 +5612,7 @@ FUNK uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { return result; } -FUNK int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { +__funline int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { int8x16_t result; __asm__("mls %0.16b,%2.16b,%3.16b" : "=w"(result) @@ -5610,7 +5621,7 @@ FUNK int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return result; } -FUNK int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { +__funline int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { int16x8_t result; __asm__("mls %0.8h,%2.8h,%3.8h" : "=w"(result) @@ -5619,7 +5630,7 @@ FUNK int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return result; } -FUNK int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { +__funline int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { int32x4_t result; __asm__("mls %0.4s,%2.4s,%3.4s" : "=w"(result) @@ -5628,7 +5639,7 @@ FUNK int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return result; } -FUNK uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { +__funline uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t 
b, uint8x16_t c) { uint8x16_t result; __asm__("mls %0.16b,%2.16b,%3.16b" : "=w"(result) @@ -5637,7 +5648,7 @@ FUNK uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return result; } -FUNK uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { +__funline uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { uint16x8_t result; __asm__("mls %0.8h,%2.8h,%3.8h" : "=w"(result) @@ -5646,7 +5657,7 @@ FUNK uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return result; } -FUNK uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { +__funline uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { uint32x4_t result; __asm__("mls %0.4s,%2.4s,%3.4s" : "=w"(result) @@ -5655,145 +5666,145 @@ FUNK uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return result; } -FUNK int16x8_t vmovl_high_s8(int8x16_t a) { +__funline int16x8_t vmovl_high_s8(int8x16_t a) { int16x8_t result; __asm__("sshll2 %0.8h,%1.16b,#0" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int32x4_t vmovl_high_s16(int16x8_t a) { +__funline int32x4_t vmovl_high_s16(int16x8_t a) { int32x4_t result; __asm__("sshll2 %0.4s,%1.8h,#0" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int64x2_t vmovl_high_s32(int32x4_t a) { +__funline int64x2_t vmovl_high_s32(int32x4_t a) { int64x2_t result; __asm__("sshll2 %0.2d,%1.4s,#0" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint16x8_t vmovl_high_u8(uint8x16_t a) { +__funline uint16x8_t vmovl_high_u8(uint8x16_t a) { uint16x8_t result; __asm__("ushll2 %0.8h,%1.16b,#0" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint32x4_t vmovl_high_u16(uint16x8_t a) { +__funline uint32x4_t vmovl_high_u16(uint16x8_t a) { uint32x4_t result; __asm__("ushll2 %0.4s,%1.8h,#0" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint64x2_t vmovl_high_u32(uint32x4_t a) { +__funline uint64x2_t vmovl_high_u32(uint32x4_t a) { uint64x2_t result; __asm__("ushll2 %0.2d,%1.4s,#0" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int16x8_t vmovl_s8(int8x8_t a) { +__funline int16x8_t vmovl_s8(int8x8_t a) { int16x8_t result; __asm__("sshll %0.8h,%1.8b,#0" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int32x4_t vmovl_s16(int16x4_t a) { +__funline int32x4_t vmovl_s16(int16x4_t a) { int32x4_t result; __asm__("sshll %0.4s,%1.4h,#0" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int64x2_t vmovl_s32(int32x2_t a) { +__funline int64x2_t vmovl_s32(int32x2_t a) { int64x2_t result; __asm__("sshll %0.2d,%1.2s,#0" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint16x8_t vmovl_u8(uint8x8_t a) { +__funline uint16x8_t vmovl_u8(uint8x8_t a) { uint16x8_t result; __asm__("ushll %0.8h,%1.8b,#0" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint32x4_t vmovl_u16(uint16x4_t a) { +__funline uint32x4_t vmovl_u16(uint16x4_t a) { uint32x4_t result; __asm__("ushll %0.4s,%1.4h,#0" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint64x2_t vmovl_u32(uint32x2_t a) { +__funline uint64x2_t vmovl_u32(uint32x2_t a) { uint64x2_t result; __asm__("ushll %0.2d,%1.2s,#0" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int8x16_t vmovn_high_s16(int8x8_t a, int16x8_t b) { +__funline int8x16_t vmovn_high_s16(int8x8_t a, int16x8_t b) { int8x16_t result = vcombine_s8(a, vcreate_s8(__AARCH64_UINT64_C(0x0))); __asm__("xtn2 %0.16b,%1.8h" : "+w"(result) 
: "w"(b) : /* No clobbers */); return result; } -FUNK int16x8_t vmovn_high_s32(int16x4_t a, int32x4_t b) { +__funline int16x8_t vmovn_high_s32(int16x4_t a, int32x4_t b) { int16x8_t result = vcombine_s16(a, vcreate_s16(__AARCH64_UINT64_C(0x0))); __asm__("xtn2 %0.8h,%1.4s" : "+w"(result) : "w"(b) : /* No clobbers */); return result; } -FUNK int32x4_t vmovn_high_s64(int32x2_t a, int64x2_t b) { +__funline int32x4_t vmovn_high_s64(int32x2_t a, int64x2_t b) { int32x4_t result = vcombine_s32(a, vcreate_s32(__AARCH64_UINT64_C(0x0))); __asm__("xtn2 %0.4s,%1.2d" : "+w"(result) : "w"(b) : /* No clobbers */); return result; } -FUNK uint8x16_t vmovn_high_u16(uint8x8_t a, uint16x8_t b) { +__funline uint8x16_t vmovn_high_u16(uint8x8_t a, uint16x8_t b) { uint8x16_t result = vcombine_u8(a, vcreate_u8(__AARCH64_UINT64_C(0x0))); __asm__("xtn2 %0.16b,%1.8h" : "+w"(result) : "w"(b) : /* No clobbers */); return result; } -FUNK uint16x8_t vmovn_high_u32(uint16x4_t a, uint32x4_t b) { +__funline uint16x8_t vmovn_high_u32(uint16x4_t a, uint32x4_t b) { uint16x8_t result = vcombine_u16(a, vcreate_u16(__AARCH64_UINT64_C(0x0))); __asm__("xtn2 %0.8h,%1.4s" : "+w"(result) : "w"(b) : /* No clobbers */); return result; } -FUNK uint32x4_t vmovn_high_u64(uint32x2_t a, uint64x2_t b) { +__funline uint32x4_t vmovn_high_u64(uint32x2_t a, uint64x2_t b) { uint32x4_t result = vcombine_u32(a, vcreate_u32(__AARCH64_UINT64_C(0x0))); __asm__("xtn2 %0.4s,%1.2d" : "+w"(result) : "w"(b) : /* No clobbers */); return result; } -FUNK int8x8_t vmovn_s16(int16x8_t a) { +__funline int8x8_t vmovn_s16(int16x8_t a) { int8x8_t result; __asm__("xtn %0.8b,%1.8h" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int16x4_t vmovn_s32(int32x4_t a) { +__funline int16x4_t vmovn_s32(int32x4_t a) { int16x4_t result; __asm__("xtn %0.4h,%1.4s" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int32x2_t vmovn_s64(int64x2_t a) { +__funline int32x2_t vmovn_s64(int64x2_t a) { int32x2_t result; __asm__("xtn %0.2s,%1.2d" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint8x8_t vmovn_u16(uint16x8_t a) { +__funline uint8x8_t vmovn_u16(uint16x8_t a) { uint8x8_t result; __asm__("xtn %0.8b,%1.8h" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint16x4_t vmovn_u32(uint32x4_t a) { +__funline uint16x4_t vmovn_u32(uint32x4_t a) { uint16x4_t result; __asm__("xtn %0.4h,%1.4s" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint32x2_t vmovn_u64(uint64x2_t a) { +__funline uint32x2_t vmovn_u64(uint64x2_t a) { uint32x2_t result; __asm__("xtn %0.2s,%1.2d" : "=w"(result) : "w"(a) : /* No clobbers */); return result; @@ -5895,7 +5906,7 @@ FUNK uint32x2_t vmovn_u64(uint64x2_t a) { result; \ }) -FUNK int32x4_t vmull_high_n_s16(int16x8_t a, int16_t b) { +__funline int32x4_t vmull_high_n_s16(int16x8_t a, int16_t b) { int32x4_t result; __asm__("smull2 %0.4s,%1.8h,%2.h[0]" : "=w"(result) @@ -5904,7 +5915,7 @@ FUNK int32x4_t vmull_high_n_s16(int16x8_t a, int16_t b) { return result; } -FUNK int64x2_t vmull_high_n_s32(int32x4_t a, int32_t b) { +__funline int64x2_t vmull_high_n_s32(int32x4_t a, int32_t b) { int64x2_t result; __asm__("smull2 %0.2d,%1.4s,%2.s[0]" : "=w"(result) @@ -5913,7 +5924,7 @@ FUNK int64x2_t vmull_high_n_s32(int32x4_t a, int32_t b) { return result; } -FUNK uint32x4_t vmull_high_n_u16(uint16x8_t a, uint16_t b) { +__funline uint32x4_t vmull_high_n_u16(uint16x8_t a, uint16_t b) { uint32x4_t result; __asm__("umull2 %0.4s,%1.8h,%2.h[0]" : "=w"(result) @@ -5922,7 +5933,7 @@ 
FUNK uint32x4_t vmull_high_n_u16(uint16x8_t a, uint16_t b) { return result; } -FUNK uint64x2_t vmull_high_n_u32(uint32x4_t a, uint32_t b) { +__funline uint64x2_t vmull_high_n_u32(uint32x4_t a, uint32_t b) { uint64x2_t result; __asm__("umull2 %0.2d,%1.4s,%2.s[0]" : "=w"(result) @@ -5931,7 +5942,7 @@ FUNK uint64x2_t vmull_high_n_u32(uint32x4_t a, uint32_t b) { return result; } -FUNK poly16x8_t vmull_high_p8(poly8x16_t a, poly8x16_t b) { +__funline poly16x8_t vmull_high_p8(poly8x16_t a, poly8x16_t b) { poly16x8_t result; __asm__("pmull2 %0.8h,%1.16b,%2.16b" : "=w"(result) @@ -5940,7 +5951,7 @@ FUNK poly16x8_t vmull_high_p8(poly8x16_t a, poly8x16_t b) { return result; } -FUNK int16x8_t vmull_high_s8(int8x16_t a, int8x16_t b) { +__funline int16x8_t vmull_high_s8(int8x16_t a, int8x16_t b) { int16x8_t result; __asm__("smull2 %0.8h,%1.16b,%2.16b" : "=w"(result) @@ -5949,7 +5960,7 @@ FUNK int16x8_t vmull_high_s8(int8x16_t a, int8x16_t b) { return result; } -FUNK int32x4_t vmull_high_s16(int16x8_t a, int16x8_t b) { +__funline int32x4_t vmull_high_s16(int16x8_t a, int16x8_t b) { int32x4_t result; __asm__("smull2 %0.4s,%1.8h,%2.8h" : "=w"(result) @@ -5958,7 +5969,7 @@ FUNK int32x4_t vmull_high_s16(int16x8_t a, int16x8_t b) { return result; } -FUNK int64x2_t vmull_high_s32(int32x4_t a, int32x4_t b) { +__funline int64x2_t vmull_high_s32(int32x4_t a, int32x4_t b) { int64x2_t result; __asm__("smull2 %0.2d,%1.4s,%2.4s" : "=w"(result) @@ -5967,7 +5978,7 @@ FUNK int64x2_t vmull_high_s32(int32x4_t a, int32x4_t b) { return result; } -FUNK uint16x8_t vmull_high_u8(uint8x16_t a, uint8x16_t b) { +__funline uint16x8_t vmull_high_u8(uint8x16_t a, uint8x16_t b) { uint16x8_t result; __asm__("umull2 %0.8h,%1.16b,%2.16b" : "=w"(result) @@ -5976,7 +5987,7 @@ FUNK uint16x8_t vmull_high_u8(uint8x16_t a, uint8x16_t b) { return result; } -FUNK uint32x4_t vmull_high_u16(uint16x8_t a, uint16x8_t b) { +__funline uint32x4_t vmull_high_u16(uint16x8_t a, uint16x8_t b) { uint32x4_t result; __asm__("umull2 %0.4s,%1.8h,%2.8h" : "=w"(result) @@ -5985,7 +5996,7 @@ FUNK uint32x4_t vmull_high_u16(uint16x8_t a, uint16x8_t b) { return result; } -FUNK uint64x2_t vmull_high_u32(uint32x4_t a, uint32x4_t b) { +__funline uint64x2_t vmull_high_u32(uint32x4_t a, uint32x4_t b) { uint64x2_t result; __asm__("umull2 %0.2d,%1.4s,%2.4s" : "=w"(result) @@ -6090,7 +6101,7 @@ FUNK uint64x2_t vmull_high_u32(uint32x4_t a, uint32x4_t b) { result; \ }) -FUNK int32x4_t vmull_n_s16(int16x4_t a, int16_t b) { +__funline int32x4_t vmull_n_s16(int16x4_t a, int16_t b) { int32x4_t result; __asm__("smull %0.4s,%1.4h,%2.h[0]" : "=w"(result) @@ -6099,7 +6110,7 @@ FUNK int32x4_t vmull_n_s16(int16x4_t a, int16_t b) { return result; } -FUNK int64x2_t vmull_n_s32(int32x2_t a, int32_t b) { +__funline int64x2_t vmull_n_s32(int32x2_t a, int32_t b) { int64x2_t result; __asm__("smull %0.2d,%1.2s,%2.s[0]" : "=w"(result) @@ -6108,7 +6119,7 @@ FUNK int64x2_t vmull_n_s32(int32x2_t a, int32_t b) { return result; } -FUNK uint32x4_t vmull_n_u16(uint16x4_t a, uint16_t b) { +__funline uint32x4_t vmull_n_u16(uint16x4_t a, uint16_t b) { uint32x4_t result; __asm__("umull %0.4s,%1.4h,%2.h[0]" : "=w"(result) @@ -6117,7 +6128,7 @@ FUNK uint32x4_t vmull_n_u16(uint16x4_t a, uint16_t b) { return result; } -FUNK uint64x2_t vmull_n_u32(uint32x2_t a, uint32_t b) { +__funline uint64x2_t vmull_n_u32(uint32x2_t a, uint32_t b) { uint64x2_t result; __asm__("umull %0.2d,%1.2s,%2.s[0]" : "=w"(result) @@ -6126,7 +6137,7 @@ FUNK uint64x2_t vmull_n_u32(uint32x2_t a, uint32_t b) { return result; } -FUNK 
poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b) { +__funline poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b) { poly16x8_t result; __asm__("pmull %0.8h, %1.8b, %2.8b" : "=w"(result) @@ -6135,7 +6146,7 @@ FUNK poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b) { return result; } -FUNK int16x8_t vmull_s8(int8x8_t a, int8x8_t b) { +__funline int16x8_t vmull_s8(int8x8_t a, int8x8_t b) { int16x8_t result; __asm__("smull %0.8h, %1.8b, %2.8b" : "=w"(result) @@ -6144,7 +6155,7 @@ FUNK int16x8_t vmull_s8(int8x8_t a, int8x8_t b) { return result; } -FUNK int32x4_t vmull_s16(int16x4_t a, int16x4_t b) { +__funline int32x4_t vmull_s16(int16x4_t a, int16x4_t b) { int32x4_t result; __asm__("smull %0.4s, %1.4h, %2.4h" : "=w"(result) @@ -6153,7 +6164,7 @@ FUNK int32x4_t vmull_s16(int16x4_t a, int16x4_t b) { return result; } -FUNK int64x2_t vmull_s32(int32x2_t a, int32x2_t b) { +__funline int64x2_t vmull_s32(int32x2_t a, int32x2_t b) { int64x2_t result; __asm__("smull %0.2d, %1.2s, %2.2s" : "=w"(result) @@ -6162,7 +6173,7 @@ FUNK int64x2_t vmull_s32(int32x2_t a, int32x2_t b) { return result; } -FUNK uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) { +__funline uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) { uint16x8_t result; __asm__("umull %0.8h, %1.8b, %2.8b" : "=w"(result) @@ -6171,7 +6182,7 @@ FUNK uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) { return result; } -FUNK uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) { +__funline uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) { uint32x4_t result; __asm__("umull %0.4s, %1.4h, %2.4h" : "=w"(result) @@ -6180,7 +6191,7 @@ FUNK uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) { return result; } -FUNK uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) { +__funline uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) { uint64x2_t result; __asm__("umull %0.2d, %1.2s, %2.2s" : "=w"(result) @@ -6189,7 +6200,7 @@ FUNK uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) { return result; } -FUNK int16x4_t vpadal_s8(int16x4_t a, int8x8_t b) { +__funline int16x4_t vpadal_s8(int16x4_t a, int8x8_t b) { int16x4_t result; __asm__("sadalp %0.4h,%2.8b" : "=w"(result) @@ -6198,7 +6209,7 @@ FUNK int16x4_t vpadal_s8(int16x4_t a, int8x8_t b) { return result; } -FUNK int32x2_t vpadal_s16(int32x2_t a, int16x4_t b) { +__funline int32x2_t vpadal_s16(int32x2_t a, int16x4_t b) { int32x2_t result; __asm__("sadalp %0.2s,%2.4h" : "=w"(result) @@ -6207,7 +6218,7 @@ FUNK int32x2_t vpadal_s16(int32x2_t a, int16x4_t b) { return result; } -FUNK int64x1_t vpadal_s32(int64x1_t a, int32x2_t b) { +__funline int64x1_t vpadal_s32(int64x1_t a, int32x2_t b) { int64x1_t result; __asm__("sadalp %0.1d,%2.2s" : "=w"(result) @@ -6216,7 +6227,7 @@ FUNK int64x1_t vpadal_s32(int64x1_t a, int32x2_t b) { return result; } -FUNK uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b) { +__funline uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b) { uint16x4_t result; __asm__("uadalp %0.4h,%2.8b" : "=w"(result) @@ -6225,7 +6236,7 @@ FUNK uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b) { return result; } -FUNK uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b) { +__funline uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b) { uint32x2_t result; __asm__("uadalp %0.2s,%2.4h" : "=w"(result) @@ -6234,7 +6245,7 @@ FUNK uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b) { return result; } -FUNK uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b) { +__funline uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b) { uint64x1_t result; __asm__("uadalp %0.1d,%2.2s" : "=w"(result) @@ -6243,7 +6254,7 @@ FUNK uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b) { return 
result; } -FUNK int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) { +__funline int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) { int16x8_t result; __asm__("sadalp %0.8h,%2.16b" : "=w"(result) @@ -6252,7 +6263,7 @@ FUNK int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) { return result; } -FUNK int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) { +__funline int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) { int32x4_t result; __asm__("sadalp %0.4s,%2.8h" : "=w"(result) @@ -6261,7 +6272,7 @@ FUNK int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) { return result; } -FUNK int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b) { +__funline int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b) { int64x2_t result; __asm__("sadalp %0.2d,%2.4s" : "=w"(result) @@ -6270,7 +6281,7 @@ FUNK int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b) { return result; } -FUNK uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) { +__funline uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) { uint16x8_t result; __asm__("uadalp %0.8h,%2.16b" : "=w"(result) @@ -6279,7 +6290,7 @@ FUNK uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) { return result; } -FUNK uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b) { +__funline uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b) { uint32x4_t result; __asm__("uadalp %0.4s,%2.8h" : "=w"(result) @@ -6288,7 +6299,7 @@ FUNK uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b) { return result; } -FUNK uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b) { +__funline uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b) { uint64x2_t result; __asm__("uadalp %0.2d,%2.4s" : "=w"(result) @@ -6297,79 +6308,79 @@ FUNK uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b) { return result; } -FUNK int16x4_t vpaddl_s8(int8x8_t a) { +__funline int16x4_t vpaddl_s8(int8x8_t a) { int16x4_t result; __asm__("saddlp %0.4h,%1.8b" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int32x2_t vpaddl_s16(int16x4_t a) { +__funline int32x2_t vpaddl_s16(int16x4_t a) { int32x2_t result; __asm__("saddlp %0.2s,%1.4h" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int64x1_t vpaddl_s32(int32x2_t a) { +__funline int64x1_t vpaddl_s32(int32x2_t a) { int64x1_t result; __asm__("saddlp %0.1d,%1.2s" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint16x4_t vpaddl_u8(uint8x8_t a) { +__funline uint16x4_t vpaddl_u8(uint8x8_t a) { uint16x4_t result; __asm__("uaddlp %0.4h,%1.8b" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint32x2_t vpaddl_u16(uint16x4_t a) { +__funline uint32x2_t vpaddl_u16(uint16x4_t a) { uint32x2_t result; __asm__("uaddlp %0.2s,%1.4h" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint64x1_t vpaddl_u32(uint32x2_t a) { +__funline uint64x1_t vpaddl_u32(uint32x2_t a) { uint64x1_t result; __asm__("uaddlp %0.1d,%1.2s" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int16x8_t vpaddlq_s8(int8x16_t a) { +__funline int16x8_t vpaddlq_s8(int8x16_t a) { int16x8_t result; __asm__("saddlp %0.8h,%1.16b" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int32x4_t vpaddlq_s16(int16x8_t a) { +__funline int32x4_t vpaddlq_s16(int16x8_t a) { int32x4_t result; __asm__("saddlp %0.4s,%1.8h" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int64x2_t vpaddlq_s32(int32x4_t a) { +__funline int64x2_t vpaddlq_s32(int32x4_t a) { int64x2_t result; __asm__("saddlp %0.2d,%1.4s" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint16x8_t vpaddlq_u8(uint8x16_t a) { +__funline uint16x8_t 
vpaddlq_u8(uint8x16_t a) { uint16x8_t result; __asm__("uaddlp %0.8h,%1.16b" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint32x4_t vpaddlq_u16(uint16x8_t a) { +__funline uint32x4_t vpaddlq_u16(uint16x8_t a) { uint32x4_t result; __asm__("uaddlp %0.4s,%1.8h" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK uint64x2_t vpaddlq_u32(uint32x4_t a) { +__funline uint64x2_t vpaddlq_u32(uint32x4_t a) { uint64x2_t result; __asm__("uaddlp %0.2d,%1.4s" : "=w"(result) : "w"(a) : /* No clobbers */); return result; } -FUNK int8x16_t vpaddq_s8(int8x16_t a, int8x16_t b) { +__funline int8x16_t vpaddq_s8(int8x16_t a, int8x16_t b) { int8x16_t result; __asm__("addp %0.16b,%1.16b,%2.16b" : "=w"(result) @@ -6378,7 +6389,7 @@ FUNK int8x16_t vpaddq_s8(int8x16_t a, int8x16_t b) { return result; } -FUNK int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { +__funline int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { int16x8_t result; __asm__("addp %0.8h,%1.8h,%2.8h" : "=w"(result) @@ -6387,7 +6398,7 @@ FUNK int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { return result; } -FUNK int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { +__funline int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { int32x4_t result; __asm__("addp %0.4s,%1.4s,%2.4s" : "=w"(result) @@ -6396,7 +6407,7 @@ FUNK int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { return result; } -FUNK int64x2_t vpaddq_s64(int64x2_t a, int64x2_t b) { +__funline int64x2_t vpaddq_s64(int64x2_t a, int64x2_t b) { int64x2_t result; __asm__("addp %0.2d,%1.2d,%2.2d" : "=w"(result) @@ -6405,7 +6416,7 @@ FUNK int64x2_t vpaddq_s64(int64x2_t a, int64x2_t b) { return result; } -FUNK uint8x16_t vpaddq_u8(uint8x16_t a, uint8x16_t b) { +__funline uint8x16_t vpaddq_u8(uint8x16_t a, uint8x16_t b) { uint8x16_t result; __asm__("addp %0.16b,%1.16b,%2.16b" : "=w"(result) @@ -6414,7 +6425,7 @@ FUNK uint8x16_t vpaddq_u8(uint8x16_t a, uint8x16_t b) { return result; } -FUNK uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { +__funline uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { uint16x8_t result; __asm__("addp %0.8h,%1.8h,%2.8h" : "=w"(result) @@ -6423,7 +6434,7 @@ FUNK uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { return result; } -FUNK uint32x4_t vpaddq_u32(uint32x4_t a, uint32x4_t b) { +__funline uint32x4_t vpaddq_u32(uint32x4_t a, uint32x4_t b) { uint32x4_t result; __asm__("addp %0.4s,%1.4s,%2.4s" : "=w"(result) @@ -6432,7 +6443,7 @@ FUNK uint32x4_t vpaddq_u32(uint32x4_t a, uint32x4_t b) { return result; } -FUNK uint64x2_t vpaddq_u64(uint64x2_t a, uint64x2_t b) { +__funline uint64x2_t vpaddq_u64(uint64x2_t a, uint64x2_t b) { uint64x2_t result; __asm__("addp %0.2d,%1.2d,%2.2d" : "=w"(result) @@ -6441,7 +6452,7 @@ FUNK uint64x2_t vpaddq_u64(uint64x2_t a, uint64x2_t b) { return result; } -FUNK int16x4_t vqdmulh_n_s16(int16x4_t a, int16_t b) { +__funline int16x4_t vqdmulh_n_s16(int16x4_t a, int16_t b) { int16x4_t result; __asm__("sqdmulh %0.4h,%1.4h,%2.h[0]" : "=w"(result) @@ -6450,7 +6461,7 @@ FUNK int16x4_t vqdmulh_n_s16(int16x4_t a, int16_t b) { return result; } -FUNK int32x2_t vqdmulh_n_s32(int32x2_t a, int32_t b) { +__funline int32x2_t vqdmulh_n_s32(int32x2_t a, int32_t b) { int32x2_t result; __asm__("sqdmulh %0.2s,%1.2s,%2.s[0]" : "=w"(result) @@ -6459,7 +6470,7 @@ FUNK int32x2_t vqdmulh_n_s32(int32x2_t a, int32_t b) { return result; } -FUNK int16x8_t vqdmulhq_n_s16(int16x8_t a, int16_t b) { +__funline int16x8_t vqdmulhq_n_s16(int16x8_t a, int16_t b) { int16x8_t result; __asm__("sqdmulh %0.8h,%1.8h,%2.h[0]" : "=w"(result) @@ -6468,7 
+6479,7 @@ FUNK int16x8_t vqdmulhq_n_s16(int16x8_t a, int16_t b) { return result; } -FUNK int32x4_t vqdmulhq_n_s32(int32x4_t a, int32_t b) { +__funline int32x4_t vqdmulhq_n_s32(int32x4_t a, int32_t b) { int32x4_t result; __asm__("sqdmulh %0.4s,%1.4s,%2.s[0]" : "=w"(result) @@ -6477,61 +6488,61 @@ FUNK int32x4_t vqdmulhq_n_s32(int32x4_t a, int32_t b) { return result; } -FUNK int8x16_t vqmovn_high_s16(int8x8_t a, int16x8_t b) { +__funline int8x16_t vqmovn_high_s16(int8x8_t a, int16x8_t b) { int8x16_t result = vcombine_s8(a, vcreate_s8(__AARCH64_UINT64_C(0x0))); __asm__("sqxtn2 %0.16b, %1.8h" : "+w"(result) : "w"(b) : /* No clobbers */); return result; } -FUNK int16x8_t vqmovn_high_s32(int16x4_t a, int32x4_t b) { +__funline int16x8_t vqmovn_high_s32(int16x4_t a, int32x4_t b) { int16x8_t result = vcombine_s16(a, vcreate_s16(__AARCH64_UINT64_C(0x0))); __asm__("sqxtn2 %0.8h, %1.4s" : "+w"(result) : "w"(b) : /* No clobbers */); return result; } -FUNK int32x4_t vqmovn_high_s64(int32x2_t a, int64x2_t b) { +__funline int32x4_t vqmovn_high_s64(int32x2_t a, int64x2_t b) { int32x4_t result = vcombine_s32(a, vcreate_s32(__AARCH64_UINT64_C(0x0))); __asm__("sqxtn2 %0.4s, %1.2d" : "+w"(result) : "w"(b) : /* No clobbers */); return result; } -FUNK uint8x16_t vqmovn_high_u16(uint8x8_t a, uint16x8_t b) { +__funline uint8x16_t vqmovn_high_u16(uint8x8_t a, uint16x8_t b) { uint8x16_t result = vcombine_u8(a, vcreate_u8(__AARCH64_UINT64_C(0x0))); __asm__("uqxtn2 %0.16b, %1.8h" : "+w"(result) : "w"(b) : /* No clobbers */); return result; } -FUNK uint16x8_t vqmovn_high_u32(uint16x4_t a, uint32x4_t b) { +__funline uint16x8_t vqmovn_high_u32(uint16x4_t a, uint32x4_t b) { uint16x8_t result = vcombine_u16(a, vcreate_u16(__AARCH64_UINT64_C(0x0))); __asm__("uqxtn2 %0.8h, %1.4s" : "+w"(result) : "w"(b) : /* No clobbers */); return result; } -FUNK uint32x4_t vqmovn_high_u64(uint32x2_t a, uint64x2_t b) { +__funline uint32x4_t vqmovn_high_u64(uint32x2_t a, uint64x2_t b) { uint32x4_t result = vcombine_u32(a, vcreate_u32(__AARCH64_UINT64_C(0x0))); __asm__("uqxtn2 %0.4s, %1.2d" : "+w"(result) : "w"(b) : /* No clobbers */); return result; } -FUNK uint8x16_t vqmovun_high_s16(uint8x8_t a, int16x8_t b) { +__funline uint8x16_t vqmovun_high_s16(uint8x8_t a, int16x8_t b) { uint8x16_t result = vcombine_u8(a, vcreate_u8(__AARCH64_UINT64_C(0x0))); __asm__("sqxtun2 %0.16b, %1.8h" : "+w"(result) : "w"(b) : /* No clobbers */); return result; } -FUNK uint16x8_t vqmovun_high_s32(uint16x4_t a, int32x4_t b) { +__funline uint16x8_t vqmovun_high_s32(uint16x4_t a, int32x4_t b) { uint16x8_t result = vcombine_u16(a, vcreate_u16(__AARCH64_UINT64_C(0x0))); __asm__("sqxtun2 %0.8h, %1.4s" : "+w"(result) : "w"(b) : /* No clobbers */); return result; } -FUNK uint32x4_t vqmovun_high_s64(uint32x2_t a, int64x2_t b) { +__funline uint32x4_t vqmovun_high_s64(uint32x2_t a, int64x2_t b) { uint32x4_t result = vcombine_u32(a, vcreate_u32(__AARCH64_UINT64_C(0x0))); __asm__("sqxtun2 %0.4s, %1.2d" : "+w"(result) : "w"(b) : /* No clobbers */); return result; } -FUNK int16x4_t vqrdmulh_n_s16(int16x4_t a, int16_t b) { +__funline int16x4_t vqrdmulh_n_s16(int16x4_t a, int16_t b) { int16x4_t result; __asm__("sqrdmulh %0.4h,%1.4h,%2.h[0]" : "=w"(result) @@ -6540,7 +6551,7 @@ FUNK int16x4_t vqrdmulh_n_s16(int16x4_t a, int16_t b) { return result; } -FUNK int32x2_t vqrdmulh_n_s32(int32x2_t a, int32_t b) { +__funline int32x2_t vqrdmulh_n_s32(int32x2_t a, int32_t b) { int32x2_t result; __asm__("sqrdmulh %0.2s,%1.2s,%2.s[0]" : "=w"(result) @@ -6549,7 +6560,7 @@ FUNK int32x2_t 
vqrdmulh_n_s32(int32x2_t a, int32_t b) { return result; } -FUNK int16x8_t vqrdmulhq_n_s16(int16x8_t a, int16_t b) { +__funline int16x8_t vqrdmulhq_n_s16(int16x8_t a, int16_t b) { int16x8_t result; __asm__("sqrdmulh %0.8h,%1.8h,%2.h[0]" : "=w"(result) @@ -6558,7 +6569,7 @@ FUNK int16x8_t vqrdmulhq_n_s16(int16x8_t a, int16_t b) { return result; } -FUNK int32x4_t vqrdmulhq_n_s32(int32x4_t a, int32_t b) { +__funline int32x4_t vqrdmulhq_n_s32(int32x4_t a, int32_t b) { int32x4_t result; __asm__("sqrdmulh %0.4s,%1.4s,%2.s[0]" : "=w"(result) @@ -7511,53 +7522,59 @@ __ST4_LANE_FUNC(uint16x8x4_t, uint16_t, v8hi, hi, u16) __ST4_LANE_FUNC(uint32x4x4_t, uint32_t, v4si, si, u32) __ST4_LANE_FUNC(uint64x2x4_t, uint64_t, v2di, di, u64) -FUNK int64_t vaddlv_s32(int32x2_t a) { +__funline int64_t vaddlv_s32(int32x2_t a) { int64_t result; __asm__("saddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) :); return result; } -FUNK uint64_t vaddlv_u32(uint32x2_t a) { +__funline uint64_t vaddlv_u32(uint32x2_t a) { uint64_t result; __asm__("uaddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) :); return result; } -FUNK int16x4_t vqdmulh_laneq_s16(int16x4_t __a, int16x8_t __b, const int __c) { +__funline int16x4_t vqdmulh_laneq_s16(int16x4_t __a, int16x8_t __b, + const int __c) { return __builtin_aarch64_sqdmulh_laneqv4hi(__a, __b, __c); } -FUNK int32x2_t vqdmulh_laneq_s32(int32x2_t __a, int32x4_t __b, const int __c) { +__funline int32x2_t vqdmulh_laneq_s32(int32x2_t __a, int32x4_t __b, + const int __c) { return __builtin_aarch64_sqdmulh_laneqv2si(__a, __b, __c); } -FUNK int16x8_t vqdmulhq_laneq_s16(int16x8_t __a, int16x8_t __b, const int __c) { +__funline int16x8_t vqdmulhq_laneq_s16(int16x8_t __a, int16x8_t __b, + const int __c) { return __builtin_aarch64_sqdmulh_laneqv8hi(__a, __b, __c); } -FUNK int32x4_t vqdmulhq_laneq_s32(int32x4_t __a, int32x4_t __b, const int __c) { +__funline int32x4_t vqdmulhq_laneq_s32(int32x4_t __a, int32x4_t __b, + const int __c) { return __builtin_aarch64_sqdmulh_laneqv4si(__a, __b, __c); } -FUNK int16x4_t vqrdmulh_laneq_s16(int16x4_t __a, int16x8_t __b, const int __c) { +__funline int16x4_t vqrdmulh_laneq_s16(int16x4_t __a, int16x8_t __b, + const int __c) { return __builtin_aarch64_sqrdmulh_laneqv4hi(__a, __b, __c); } -FUNK int32x2_t vqrdmulh_laneq_s32(int32x2_t __a, int32x4_t __b, const int __c) { +__funline int32x2_t vqrdmulh_laneq_s32(int32x2_t __a, int32x4_t __b, + const int __c) { return __builtin_aarch64_sqrdmulh_laneqv2si(__a, __b, __c); } -FUNK int16x8_t vqrdmulhq_laneq_s16(int16x8_t __a, int16x8_t __b, - const int __c) { +__funline int16x8_t vqrdmulhq_laneq_s16(int16x8_t __a, int16x8_t __b, + const int __c) { return __builtin_aarch64_sqrdmulh_laneqv8hi(__a, __b, __c); } -FUNK int32x4_t vqrdmulhq_laneq_s32(int32x4_t __a, int32x4_t __b, - const int __c) { +__funline int32x4_t vqrdmulhq_laneq_s32(int32x4_t __a, int32x4_t __b, + const int __c) { return __builtin_aarch64_sqrdmulh_laneqv4si(__a, __b, __c); } -FUNK poly8x8_t vqtbl1_p8(poly8x16_t a, uint8x8_t b) { +__funline poly8x8_t vqtbl1_p8(poly8x16_t a, uint8x8_t b) { poly8x8_t result; __asm__("tbl %0.8b, {%1.16b}, %2.8b" : "=w"(result) @@ -7566,7 +7583,7 @@ FUNK poly8x8_t vqtbl1_p8(poly8x16_t a, uint8x8_t b) { return result; } -FUNK int8x8_t vqtbl1_s8(int8x16_t a, uint8x8_t b) { +__funline int8x8_t vqtbl1_s8(int8x16_t a, uint8x8_t b) { int8x8_t result; __asm__("tbl %0.8b, {%1.16b}, %2.8b" : "=w"(result) @@ -7575,7 +7592,7 @@ FUNK int8x8_t vqtbl1_s8(int8x16_t a, uint8x8_t b) { return result; } -FUNK uint8x8_t vqtbl1_u8(uint8x16_t a, uint8x8_t b) { 
+__funline uint8x8_t vqtbl1_u8(uint8x16_t a, uint8x8_t b) { uint8x8_t result; __asm__("tbl %0.8b, {%1.16b}, %2.8b" : "=w"(result) @@ -7584,7 +7601,7 @@ FUNK uint8x8_t vqtbl1_u8(uint8x16_t a, uint8x8_t b) { return result; } -FUNK poly8x16_t vqtbl1q_p8(poly8x16_t a, uint8x16_t b) { +__funline poly8x16_t vqtbl1q_p8(poly8x16_t a, uint8x16_t b) { poly8x16_t result; __asm__("tbl %0.16b, {%1.16b}, %2.16b" : "=w"(result) @@ -7593,7 +7610,7 @@ FUNK poly8x16_t vqtbl1q_p8(poly8x16_t a, uint8x16_t b) { return result; } -FUNK int8x16_t vqtbl1q_s8(int8x16_t a, uint8x16_t b) { +__funline int8x16_t vqtbl1q_s8(int8x16_t a, uint8x16_t b) { int8x16_t result; __asm__("tbl %0.16b, {%1.16b}, %2.16b" : "=w"(result) @@ -7602,7 +7619,7 @@ FUNK int8x16_t vqtbl1q_s8(int8x16_t a, uint8x16_t b) { return result; } -FUNK uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { +__funline uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { uint8x16_t result; __asm__("tbl %0.16b, {%1.16b}, %2.16b" : "=w"(result) @@ -7611,7 +7628,7 @@ FUNK uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { return result; } -FUNK int8x8_t vqtbx1_s8(int8x8_t r, int8x16_t tab, uint8x8_t idx) { +__funline int8x8_t vqtbx1_s8(int8x8_t r, int8x16_t tab, uint8x8_t idx) { int8x8_t result = r; __asm__("tbx %0.8b,{%1.16b},%2.8b" : "+w"(result) @@ -7620,7 +7637,7 @@ FUNK int8x8_t vqtbx1_s8(int8x8_t r, int8x16_t tab, uint8x8_t idx) { return result; } -FUNK uint8x8_t vqtbx1_u8(uint8x8_t r, uint8x16_t tab, uint8x8_t idx) { +__funline uint8x8_t vqtbx1_u8(uint8x8_t r, uint8x16_t tab, uint8x8_t idx) { uint8x8_t result = r; __asm__("tbx %0.8b,{%1.16b},%2.8b" : "+w"(result) @@ -7629,7 +7646,7 @@ FUNK uint8x8_t vqtbx1_u8(uint8x8_t r, uint8x16_t tab, uint8x8_t idx) { return result; } -FUNK poly8x8_t vqtbx1_p8(poly8x8_t r, poly8x16_t tab, uint8x8_t idx) { +__funline poly8x8_t vqtbx1_p8(poly8x8_t r, poly8x16_t tab, uint8x8_t idx) { poly8x8_t result = r; __asm__("tbx %0.8b,{%1.16b},%2.8b" : "+w"(result) @@ -7638,7 +7655,7 @@ FUNK poly8x8_t vqtbx1_p8(poly8x8_t r, poly8x16_t tab, uint8x8_t idx) { return result; } -FUNK int8x16_t vqtbx1q_s8(int8x16_t r, int8x16_t tab, uint8x16_t idx) { +__funline int8x16_t vqtbx1q_s8(int8x16_t r, int8x16_t tab, uint8x16_t idx) { int8x16_t result = r; __asm__("tbx %0.16b,{%1.16b},%2.16b" : "+w"(result) @@ -7647,7 +7664,7 @@ FUNK int8x16_t vqtbx1q_s8(int8x16_t r, int8x16_t tab, uint8x16_t idx) { return result; } -FUNK uint8x16_t vqtbx1q_u8(uint8x16_t r, uint8x16_t tab, uint8x16_t idx) { +__funline uint8x16_t vqtbx1q_u8(uint8x16_t r, uint8x16_t tab, uint8x16_t idx) { uint8x16_t result = r; __asm__("tbx %0.16b,{%1.16b},%2.16b" : "+w"(result) @@ -7656,7 +7673,7 @@ FUNK uint8x16_t vqtbx1q_u8(uint8x16_t r, uint8x16_t tab, uint8x16_t idx) { return result; } -FUNK poly8x16_t vqtbx1q_p8(poly8x16_t r, poly8x16_t tab, uint8x16_t idx) { +__funline poly8x16_t vqtbx1q_p8(poly8x16_t r, poly8x16_t tab, uint8x16_t idx) { poly8x16_t result = r; __asm__("tbx %0.16b,{%1.16b},%2.16b" : "+w"(result) @@ -7665,7 +7682,7 @@ FUNK poly8x16_t vqtbx1q_p8(poly8x16_t r, poly8x16_t tab, uint8x16_t idx) { return result; } -FUNK int8x8_t vtbl1_s8(int8x8_t tab, int8x8_t idx) { +__funline int8x8_t vtbl1_s8(int8x8_t tab, int8x8_t idx) { int8x8_t result; int8x16_t temp = vcombine_s8(tab, vcreate_s8(__AARCH64_UINT64_C(0x0))); __asm__("tbl %0.8b, {%1.16b}, %2.8b" @@ -7675,7 +7692,7 @@ FUNK int8x8_t vtbl1_s8(int8x8_t tab, int8x8_t idx) { return result; } -FUNK uint8x8_t vtbl1_u8(uint8x8_t tab, uint8x8_t idx) { +__funline uint8x8_t vtbl1_u8(uint8x8_t tab, uint8x8_t idx) { 
uint8x8_t result; uint8x16_t temp = vcombine_u8(tab, vcreate_u8(__AARCH64_UINT64_C(0x0))); __asm__("tbl %0.8b, {%1.16b}, %2.8b" @@ -7685,7 +7702,7 @@ FUNK uint8x8_t vtbl1_u8(uint8x8_t tab, uint8x8_t idx) { return result; } -FUNK poly8x8_t vtbl1_p8(poly8x8_t tab, uint8x8_t idx) { +__funline poly8x8_t vtbl1_p8(poly8x8_t tab, uint8x8_t idx) { poly8x8_t result; poly8x16_t temp = vcombine_p8(tab, vcreate_p8(__AARCH64_UINT64_C(0x0))); __asm__("tbl %0.8b, {%1.16b}, %2.8b" @@ -7695,7 +7712,7 @@ FUNK poly8x8_t vtbl1_p8(poly8x8_t tab, uint8x8_t idx) { return result; } -FUNK int8x8_t vtbl2_s8(int8x8x2_t tab, int8x8_t idx) { +__funline int8x8_t vtbl2_s8(int8x8x2_t tab, int8x8_t idx) { int8x8_t result; int8x16_t temp = vcombine_s8(tab.val[0], tab.val[1]); __asm__("tbl %0.8b, {%1.16b}, %2.8b" @@ -7705,7 +7722,7 @@ FUNK int8x8_t vtbl2_s8(int8x8x2_t tab, int8x8_t idx) { return result; } -FUNK uint8x8_t vtbl2_u8(uint8x8x2_t tab, uint8x8_t idx) { +__funline uint8x8_t vtbl2_u8(uint8x8x2_t tab, uint8x8_t idx) { uint8x8_t result; uint8x16_t temp = vcombine_u8(tab.val[0], tab.val[1]); __asm__("tbl %0.8b, {%1.16b}, %2.8b" @@ -7715,7 +7732,7 @@ FUNK uint8x8_t vtbl2_u8(uint8x8x2_t tab, uint8x8_t idx) { return result; } -FUNK poly8x8_t vtbl2_p8(poly8x8x2_t tab, uint8x8_t idx) { +__funline poly8x8_t vtbl2_p8(poly8x8x2_t tab, uint8x8_t idx) { poly8x8_t result; poly8x16_t temp = vcombine_p8(tab.val[0], tab.val[1]); __asm__("tbl %0.8b, {%1.16b}, %2.8b" @@ -7725,7 +7742,7 @@ FUNK poly8x8_t vtbl2_p8(poly8x8x2_t tab, uint8x8_t idx) { return result; } -FUNK int8x8_t vtbl3_s8(int8x8x3_t tab, int8x8_t idx) { +__funline int8x8_t vtbl3_s8(int8x8x3_t tab, int8x8_t idx) { int8x8_t result; int8x16x2_t temp; __builtin_aarch64_simd_oi __o; @@ -7737,7 +7754,7 @@ FUNK int8x8_t vtbl3_s8(int8x8x3_t tab, int8x8_t idx) { return result; } -FUNK uint8x8_t vtbl3_u8(uint8x8x3_t tab, uint8x8_t idx) { +__funline uint8x8_t vtbl3_u8(uint8x8x3_t tab, uint8x8_t idx) { uint8x8_t result; uint8x16x2_t temp; __builtin_aarch64_simd_oi __o; @@ -7749,7 +7766,7 @@ FUNK uint8x8_t vtbl3_u8(uint8x8x3_t tab, uint8x8_t idx) { return result; } -FUNK poly8x8_t vtbl3_p8(poly8x8x3_t tab, uint8x8_t idx) { +__funline poly8x8_t vtbl3_p8(poly8x8x3_t tab, uint8x8_t idx) { poly8x8_t result; poly8x16x2_t temp; __builtin_aarch64_simd_oi __o; @@ -7761,7 +7778,7 @@ FUNK poly8x8_t vtbl3_p8(poly8x8x3_t tab, uint8x8_t idx) { return result; } -FUNK int8x8_t vtbl4_s8(int8x8x4_t tab, int8x8_t idx) { +__funline int8x8_t vtbl4_s8(int8x8x4_t tab, int8x8_t idx) { int8x8_t result; int8x16x2_t temp; __builtin_aarch64_simd_oi __o; @@ -7773,7 +7790,7 @@ FUNK int8x8_t vtbl4_s8(int8x8x4_t tab, int8x8_t idx) { return result; } -FUNK uint8x8_t vtbl4_u8(uint8x8x4_t tab, uint8x8_t idx) { +__funline uint8x8_t vtbl4_u8(uint8x8x4_t tab, uint8x8_t idx) { uint8x8_t result; uint8x16x2_t temp; __builtin_aarch64_simd_oi __o; @@ -7785,7 +7802,7 @@ FUNK uint8x8_t vtbl4_u8(uint8x8x4_t tab, uint8x8_t idx) { return result; } -FUNK poly8x8_t vtbl4_p8(poly8x8x4_t tab, uint8x8_t idx) { +__funline poly8x8_t vtbl4_p8(poly8x8x4_t tab, uint8x8_t idx) { poly8x8_t result; poly8x16x2_t temp; __builtin_aarch64_simd_oi __o; @@ -7797,7 +7814,7 @@ FUNK poly8x8_t vtbl4_p8(poly8x8x4_t tab, uint8x8_t idx) { return result; } -FUNK int8x8_t vtbx2_s8(int8x8_t r, int8x8x2_t tab, int8x8_t idx) { +__funline int8x8_t vtbx2_s8(int8x8_t r, int8x8x2_t tab, int8x8_t idx) { int8x8_t result = r; int8x16_t temp = vcombine_s8(tab.val[0], tab.val[1]); __asm__("tbx %0.8b, {%1.16b}, %2.8b" @@ -7807,7 +7824,7 @@ FUNK int8x8_t 
vtbx2_s8(int8x8_t r, int8x8x2_t tab, int8x8_t idx) { return result; } -FUNK uint8x8_t vtbx2_u8(uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx) { +__funline uint8x8_t vtbx2_u8(uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx) { uint8x8_t result = r; uint8x16_t temp = vcombine_u8(tab.val[0], tab.val[1]); __asm__("tbx %0.8b, {%1.16b}, %2.8b" @@ -7817,7 +7834,7 @@ FUNK uint8x8_t vtbx2_u8(uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx) { return result; } -FUNK poly8x8_t vtbx2_p8(poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx) { +__funline poly8x8_t vtbx2_p8(poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx) { poly8x8_t result = r; poly8x16_t temp = vcombine_p8(tab.val[0], tab.val[1]); __asm__("tbx %0.8b, {%1.16b}, %2.8b" @@ -7827,440 +7844,446 @@ FUNK poly8x8_t vtbx2_p8(poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx) { return result; } -FUNK float32_t vabds_f32(float32_t __a, float32_t __b) { +__funline float32_t vabds_f32(float32_t __a, float32_t __b) { return __builtin_aarch64_fabdsf(__a, __b); } -FUNK float64_t vabdd_f64(float64_t __a, float64_t __b) { +__funline float64_t vabdd_f64(float64_t __a, float64_t __b) { return __builtin_aarch64_fabddf(__a, __b); } -FUNK float32x2_t vabd_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vabd_f32(float32x2_t __a, float32x2_t __b) { return __builtin_aarch64_fabdv2sf(__a, __b); } -FUNK float64x1_t vabd_f64(float64x1_t __a, float64x1_t __b) { +__funline float64x1_t vabd_f64(float64x1_t __a, float64x1_t __b) { return (float64x1_t){vabdd_f64(vget_lane_f64(__a, 0), vget_lane_f64(__b, 0))}; } -FUNK float32x4_t vabdq_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vabdq_f32(float32x4_t __a, float32x4_t __b) { return __builtin_aarch64_fabdv4sf(__a, __b); } -FUNK float64x2_t vabdq_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vabdq_f64(float64x2_t __a, float64x2_t __b) { return __builtin_aarch64_fabdv2df(__a, __b); } -FUNK float32x2_t vabs_f32(float32x2_t __a) { +__funline float32x2_t vabs_f32(float32x2_t __a) { return __builtin_aarch64_absv2sf(__a); } -FUNK float64x1_t vabs_f64(float64x1_t __a) { +__funline float64x1_t vabs_f64(float64x1_t __a) { return (float64x1_t){__builtin_fabs(__a[0])}; } -FUNK int8x8_t vabs_s8(int8x8_t __a) { +__funline int8x8_t vabs_s8(int8x8_t __a) { return __builtin_aarch64_absv8qi(__a); } -FUNK int16x4_t vabs_s16(int16x4_t __a) { +__funline int16x4_t vabs_s16(int16x4_t __a) { return __builtin_aarch64_absv4hi(__a); } -FUNK int32x2_t vabs_s32(int32x2_t __a) { +__funline int32x2_t vabs_s32(int32x2_t __a) { return __builtin_aarch64_absv2si(__a); } -FUNK int64x1_t vabs_s64(int64x1_t __a) { +__funline int64x1_t vabs_s64(int64x1_t __a) { return (int64x1_t){__builtin_aarch64_absdi(__a[0])}; } -FUNK float32x4_t vabsq_f32(float32x4_t __a) { +__funline float32x4_t vabsq_f32(float32x4_t __a) { return __builtin_aarch64_absv4sf(__a); } -FUNK float64x2_t vabsq_f64(float64x2_t __a) { +__funline float64x2_t vabsq_f64(float64x2_t __a) { return __builtin_aarch64_absv2df(__a); } -FUNK int8x16_t vabsq_s8(int8x16_t __a) { +__funline int8x16_t vabsq_s8(int8x16_t __a) { return __builtin_aarch64_absv16qi(__a); } -FUNK int16x8_t vabsq_s16(int16x8_t __a) { +__funline int16x8_t vabsq_s16(int16x8_t __a) { return __builtin_aarch64_absv8hi(__a); } -FUNK int32x4_t vabsq_s32(int32x4_t __a) { +__funline int32x4_t vabsq_s32(int32x4_t __a) { return __builtin_aarch64_absv4si(__a); } -FUNK int64x2_t vabsq_s64(int64x2_t __a) { +__funline int64x2_t vabsq_s64(int64x2_t __a) { return __builtin_aarch64_absv2di(__a); } -FUNK int64_t vabsd_s64(int64_t 
__a) { +__funline int64_t vabsd_s64(int64_t __a) { return __a < 0 ? -(uint64_t)__a : __a; } -FUNK int64_t vaddd_s64(int64_t __a, int64_t __b) { +__funline int64_t vaddd_s64(int64_t __a, int64_t __b) { return __a + __b; } -FUNK uint64_t vaddd_u64(uint64_t __a, uint64_t __b) { +__funline uint64_t vaddd_u64(uint64_t __a, uint64_t __b) { return __a + __b; } -FUNK int8_t vaddv_s8(int8x8_t __a) { +__funline int8_t vaddv_s8(int8x8_t __a) { return __builtin_aarch64_reduc_plus_scal_v8qi(__a); } -FUNK int16_t vaddv_s16(int16x4_t __a) { +__funline int16_t vaddv_s16(int16x4_t __a) { return __builtin_aarch64_reduc_plus_scal_v4hi(__a); } -FUNK int32_t vaddv_s32(int32x2_t __a) { +__funline int32_t vaddv_s32(int32x2_t __a) { return __builtin_aarch64_reduc_plus_scal_v2si(__a); } -FUNK uint8_t vaddv_u8(uint8x8_t __a) { +__funline uint8_t vaddv_u8(uint8x8_t __a) { return (uint8_t)__builtin_aarch64_reduc_plus_scal_v8qi((int8x8_t)__a); } -FUNK uint16_t vaddv_u16(uint16x4_t __a) { +__funline uint16_t vaddv_u16(uint16x4_t __a) { return (uint16_t)__builtin_aarch64_reduc_plus_scal_v4hi((int16x4_t)__a); } -FUNK uint32_t vaddv_u32(uint32x2_t __a) { +__funline uint32_t vaddv_u32(uint32x2_t __a) { return (int32_t)__builtin_aarch64_reduc_plus_scal_v2si((int32x2_t)__a); } -FUNK int8_t vaddvq_s8(int8x16_t __a) { +__funline int8_t vaddvq_s8(int8x16_t __a) { return __builtin_aarch64_reduc_plus_scal_v16qi(__a); } -FUNK int16_t vaddvq_s16(int16x8_t __a) { +__funline int16_t vaddvq_s16(int16x8_t __a) { return __builtin_aarch64_reduc_plus_scal_v8hi(__a); } -FUNK int32_t vaddvq_s32(int32x4_t __a) { +__funline int32_t vaddvq_s32(int32x4_t __a) { return __builtin_aarch64_reduc_plus_scal_v4si(__a); } -FUNK int64_t vaddvq_s64(int64x2_t __a) { +__funline int64_t vaddvq_s64(int64x2_t __a) { return __builtin_aarch64_reduc_plus_scal_v2di(__a); } -FUNK uint8_t vaddvq_u8(uint8x16_t __a) { +__funline uint8_t vaddvq_u8(uint8x16_t __a) { return (uint8_t)__builtin_aarch64_reduc_plus_scal_v16qi((int8x16_t)__a); } -FUNK uint16_t vaddvq_u16(uint16x8_t __a) { +__funline uint16_t vaddvq_u16(uint16x8_t __a) { return (uint16_t)__builtin_aarch64_reduc_plus_scal_v8hi((int16x8_t)__a); } -FUNK uint32_t vaddvq_u32(uint32x4_t __a) { +__funline uint32_t vaddvq_u32(uint32x4_t __a) { return (uint32_t)__builtin_aarch64_reduc_plus_scal_v4si((int32x4_t)__a); } -FUNK uint64_t vaddvq_u64(uint64x2_t __a) { +__funline uint64_t vaddvq_u64(uint64x2_t __a) { return (uint64_t)__builtin_aarch64_reduc_plus_scal_v2di((int64x2_t)__a); } -FUNK float32_t vaddv_f32(float32x2_t __a) { +__funline float32_t vaddv_f32(float32x2_t __a) { return __builtin_aarch64_reduc_plus_scal_v2sf(__a); } -FUNK float32_t vaddvq_f32(float32x4_t __a) { +__funline float32_t vaddvq_f32(float32x4_t __a) { return __builtin_aarch64_reduc_plus_scal_v4sf(__a); } -FUNK float64_t vaddvq_f64(float64x2_t __a) { +__funline float64_t vaddvq_f64(float64x2_t __a) { return __builtin_aarch64_reduc_plus_scal_v2df(__a); } -FUNK float16x4_t vbsl_f16(uint16x4_t __a, float16x4_t __b, float16x4_t __c) { +__funline float16x4_t vbsl_f16(uint16x4_t __a, float16x4_t __b, + float16x4_t __c) { return __builtin_aarch64_simd_bslv4hf_suss(__a, __b, __c); } -FUNK float32x2_t vbsl_f32(uint32x2_t __a, float32x2_t __b, float32x2_t __c) { +__funline float32x2_t vbsl_f32(uint32x2_t __a, float32x2_t __b, + float32x2_t __c) { return __builtin_aarch64_simd_bslv2sf_suss(__a, __b, __c); } -FUNK float64x1_t vbsl_f64(uint64x1_t __a, float64x1_t __b, float64x1_t __c) { +__funline float64x1_t vbsl_f64(uint64x1_t __a, float64x1_t __b, + 
float64x1_t __c) { return (float64x1_t){ __builtin_aarch64_simd_bsldf_suss(__a[0], __b[0], __c[0])}; } -FUNK poly8x8_t vbsl_p8(uint8x8_t __a, poly8x8_t __b, poly8x8_t __c) { +__funline poly8x8_t vbsl_p8(uint8x8_t __a, poly8x8_t __b, poly8x8_t __c) { return __builtin_aarch64_simd_bslv8qi_pupp(__a, __b, __c); } -FUNK poly16x4_t vbsl_p16(uint16x4_t __a, poly16x4_t __b, poly16x4_t __c) { +__funline poly16x4_t vbsl_p16(uint16x4_t __a, poly16x4_t __b, poly16x4_t __c) { return __builtin_aarch64_simd_bslv4hi_pupp(__a, __b, __c); } -FUNK poly64x1_t vbsl_p64(uint64x1_t __a, poly64x1_t __b, poly64x1_t __c) { +__funline poly64x1_t vbsl_p64(uint64x1_t __a, poly64x1_t __b, poly64x1_t __c) { return (poly64x1_t){ __builtin_aarch64_simd_bsldi_pupp(__a[0], __b[0], __c[0])}; } -FUNK int8x8_t vbsl_s8(uint8x8_t __a, int8x8_t __b, int8x8_t __c) { +__funline int8x8_t vbsl_s8(uint8x8_t __a, int8x8_t __b, int8x8_t __c) { return __builtin_aarch64_simd_bslv8qi_suss(__a, __b, __c); } -FUNK int16x4_t vbsl_s16(uint16x4_t __a, int16x4_t __b, int16x4_t __c) { +__funline int16x4_t vbsl_s16(uint16x4_t __a, int16x4_t __b, int16x4_t __c) { return __builtin_aarch64_simd_bslv4hi_suss(__a, __b, __c); } -FUNK int32x2_t vbsl_s32(uint32x2_t __a, int32x2_t __b, int32x2_t __c) { +__funline int32x2_t vbsl_s32(uint32x2_t __a, int32x2_t __b, int32x2_t __c) { return __builtin_aarch64_simd_bslv2si_suss(__a, __b, __c); } -FUNK int64x1_t vbsl_s64(uint64x1_t __a, int64x1_t __b, int64x1_t __c) { +__funline int64x1_t vbsl_s64(uint64x1_t __a, int64x1_t __b, int64x1_t __c) { return (int64x1_t){__builtin_aarch64_simd_bsldi_suss(__a[0], __b[0], __c[0])}; } -FUNK uint8x8_t vbsl_u8(uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) { +__funline uint8x8_t vbsl_u8(uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) { return __builtin_aarch64_simd_bslv8qi_uuuu(__a, __b, __c); } -FUNK uint16x4_t vbsl_u16(uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) { +__funline uint16x4_t vbsl_u16(uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) { return __builtin_aarch64_simd_bslv4hi_uuuu(__a, __b, __c); } -FUNK uint32x2_t vbsl_u32(uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) { +__funline uint32x2_t vbsl_u32(uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) { return __builtin_aarch64_simd_bslv2si_uuuu(__a, __b, __c); } -FUNK uint64x1_t vbsl_u64(uint64x1_t __a, uint64x1_t __b, uint64x1_t __c) { +__funline uint64x1_t vbsl_u64(uint64x1_t __a, uint64x1_t __b, uint64x1_t __c) { return (uint64x1_t){ __builtin_aarch64_simd_bsldi_uuuu(__a[0], __b[0], __c[0])}; } -FUNK float16x8_t vbslq_f16(uint16x8_t __a, float16x8_t __b, float16x8_t __c) { +__funline float16x8_t vbslq_f16(uint16x8_t __a, float16x8_t __b, + float16x8_t __c) { return __builtin_aarch64_simd_bslv8hf_suss(__a, __b, __c); } -FUNK float32x4_t vbslq_f32(uint32x4_t __a, float32x4_t __b, float32x4_t __c) { +__funline float32x4_t vbslq_f32(uint32x4_t __a, float32x4_t __b, + float32x4_t __c) { return __builtin_aarch64_simd_bslv4sf_suss(__a, __b, __c); } -FUNK float64x2_t vbslq_f64(uint64x2_t __a, float64x2_t __b, float64x2_t __c) { +__funline float64x2_t vbslq_f64(uint64x2_t __a, float64x2_t __b, + float64x2_t __c) { return __builtin_aarch64_simd_bslv2df_suss(__a, __b, __c); } -FUNK poly8x16_t vbslq_p8(uint8x16_t __a, poly8x16_t __b, poly8x16_t __c) { +__funline poly8x16_t vbslq_p8(uint8x16_t __a, poly8x16_t __b, poly8x16_t __c) { return __builtin_aarch64_simd_bslv16qi_pupp(__a, __b, __c); } -FUNK poly16x8_t vbslq_p16(uint16x8_t __a, poly16x8_t __b, poly16x8_t __c) { +__funline poly16x8_t vbslq_p16(uint16x8_t __a, poly16x8_t __b, 
poly16x8_t __c) { return __builtin_aarch64_simd_bslv8hi_pupp(__a, __b, __c); } -FUNK int8x16_t vbslq_s8(uint8x16_t __a, int8x16_t __b, int8x16_t __c) { +__funline int8x16_t vbslq_s8(uint8x16_t __a, int8x16_t __b, int8x16_t __c) { return __builtin_aarch64_simd_bslv16qi_suss(__a, __b, __c); } -FUNK int16x8_t vbslq_s16(uint16x8_t __a, int16x8_t __b, int16x8_t __c) { +__funline int16x8_t vbslq_s16(uint16x8_t __a, int16x8_t __b, int16x8_t __c) { return __builtin_aarch64_simd_bslv8hi_suss(__a, __b, __c); } -FUNK poly64x2_t vbslq_p64(uint64x2_t __a, poly64x2_t __b, poly64x2_t __c) { +__funline poly64x2_t vbslq_p64(uint64x2_t __a, poly64x2_t __b, poly64x2_t __c) { return __builtin_aarch64_simd_bslv2di_pupp(__a, __b, __c); } -FUNK int32x4_t vbslq_s32(uint32x4_t __a, int32x4_t __b, int32x4_t __c) { +__funline int32x4_t vbslq_s32(uint32x4_t __a, int32x4_t __b, int32x4_t __c) { return __builtin_aarch64_simd_bslv4si_suss(__a, __b, __c); } -FUNK int64x2_t vbslq_s64(uint64x2_t __a, int64x2_t __b, int64x2_t __c) { +__funline int64x2_t vbslq_s64(uint64x2_t __a, int64x2_t __b, int64x2_t __c) { return __builtin_aarch64_simd_bslv2di_suss(__a, __b, __c); } -FUNK uint8x16_t vbslq_u8(uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { +__funline uint8x16_t vbslq_u8(uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { return __builtin_aarch64_simd_bslv16qi_uuuu(__a, __b, __c); } -FUNK uint16x8_t vbslq_u16(uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) { +__funline uint16x8_t vbslq_u16(uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) { return __builtin_aarch64_simd_bslv8hi_uuuu(__a, __b, __c); } -FUNK uint32x4_t vbslq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { +__funline uint32x4_t vbslq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { return __builtin_aarch64_simd_bslv4si_uuuu(__a, __b, __c); } -FUNK uint64x2_t vbslq_u64(uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) { +__funline uint64x2_t vbslq_u64(uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) { return __builtin_aarch64_simd_bslv2di_uuuu(__a, __b, __c); } #pragma GCC push_options #pragma GCC target("+nothing+rdma") -FUNK int16x4_t vqrdmlah_s16(int16x4_t __a, int16x4_t __b, int16x4_t __c) { +__funline int16x4_t vqrdmlah_s16(int16x4_t __a, int16x4_t __b, int16x4_t __c) { return __builtin_aarch64_sqrdmlahv4hi(__a, __b, __c); } -FUNK int32x2_t vqrdmlah_s32(int32x2_t __a, int32x2_t __b, int32x2_t __c) { +__funline int32x2_t vqrdmlah_s32(int32x2_t __a, int32x2_t __b, int32x2_t __c) { return __builtin_aarch64_sqrdmlahv2si(__a, __b, __c); } -FUNK int16x8_t vqrdmlahq_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c) { +__funline int16x8_t vqrdmlahq_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c) { return __builtin_aarch64_sqrdmlahv8hi(__a, __b, __c); } -FUNK int32x4_t vqrdmlahq_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c) { +__funline int32x4_t vqrdmlahq_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c) { return __builtin_aarch64_sqrdmlahv4si(__a, __b, __c); } -FUNK int16x4_t vqrdmlsh_s16(int16x4_t __a, int16x4_t __b, int16x4_t __c) { +__funline int16x4_t vqrdmlsh_s16(int16x4_t __a, int16x4_t __b, int16x4_t __c) { return __builtin_aarch64_sqrdmlshv4hi(__a, __b, __c); } -FUNK int32x2_t vqrdmlsh_s32(int32x2_t __a, int32x2_t __b, int32x2_t __c) { +__funline int32x2_t vqrdmlsh_s32(int32x2_t __a, int32x2_t __b, int32x2_t __c) { return __builtin_aarch64_sqrdmlshv2si(__a, __b, __c); } -FUNK int16x8_t vqrdmlshq_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c) { +__funline int16x8_t vqrdmlshq_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c) { return 
__builtin_aarch64_sqrdmlshv8hi(__a, __b, __c); } -FUNK int32x4_t vqrdmlshq_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c) { +__funline int32x4_t vqrdmlshq_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c) { return __builtin_aarch64_sqrdmlshv4si(__a, __b, __c); } -FUNK int16x4_t vqrdmlah_laneq_s16(int16x4_t __a, int16x4_t __b, int16x8_t __c, - const int __d) { +__funline int16x4_t vqrdmlah_laneq_s16(int16x4_t __a, int16x4_t __b, + int16x8_t __c, const int __d) { return __builtin_aarch64_sqrdmlah_laneqv4hi(__a, __b, __c, __d); } -FUNK int32x2_t vqrdmlah_laneq_s32(int32x2_t __a, int32x2_t __b, int32x4_t __c, - const int __d) { +__funline int32x2_t vqrdmlah_laneq_s32(int32x2_t __a, int32x2_t __b, + int32x4_t __c, const int __d) { return __builtin_aarch64_sqrdmlah_laneqv2si(__a, __b, __c, __d); } -FUNK int16x8_t vqrdmlahq_laneq_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c, - const int __d) { +__funline int16x8_t vqrdmlahq_laneq_s16(int16x8_t __a, int16x8_t __b, + int16x8_t __c, const int __d) { return __builtin_aarch64_sqrdmlah_laneqv8hi(__a, __b, __c, __d); } -FUNK int32x4_t vqrdmlahq_laneq_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c, - const int __d) { +__funline int32x4_t vqrdmlahq_laneq_s32(int32x4_t __a, int32x4_t __b, + int32x4_t __c, const int __d) { return __builtin_aarch64_sqrdmlah_laneqv4si(__a, __b, __c, __d); } -FUNK int16x4_t vqrdmlsh_laneq_s16(int16x4_t __a, int16x4_t __b, int16x8_t __c, - const int __d) { +__funline int16x4_t vqrdmlsh_laneq_s16(int16x4_t __a, int16x4_t __b, + int16x8_t __c, const int __d) { return __builtin_aarch64_sqrdmlsh_laneqv4hi(__a, __b, __c, __d); } -FUNK int32x2_t vqrdmlsh_laneq_s32(int32x2_t __a, int32x2_t __b, int32x4_t __c, - const int __d) { +__funline int32x2_t vqrdmlsh_laneq_s32(int32x2_t __a, int32x2_t __b, + int32x4_t __c, const int __d) { return __builtin_aarch64_sqrdmlsh_laneqv2si(__a, __b, __c, __d); } -FUNK int16x8_t vqrdmlshq_laneq_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c, - const int __d) { +__funline int16x8_t vqrdmlshq_laneq_s16(int16x8_t __a, int16x8_t __b, + int16x8_t __c, const int __d) { return __builtin_aarch64_sqrdmlsh_laneqv8hi(__a, __b, __c, __d); } -FUNK int32x4_t vqrdmlshq_laneq_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c, - const int __d) { +__funline int32x4_t vqrdmlshq_laneq_s32(int32x4_t __a, int32x4_t __b, + int32x4_t __c, const int __d) { return __builtin_aarch64_sqrdmlsh_laneqv4si(__a, __b, __c, __d); } -FUNK int16x4_t vqrdmlah_lane_s16(int16x4_t __a, int16x4_t __b, int16x4_t __c, - const int __d) { +__funline int16x4_t vqrdmlah_lane_s16(int16x4_t __a, int16x4_t __b, + int16x4_t __c, const int __d) { return __builtin_aarch64_sqrdmlah_lanev4hi(__a, __b, __c, __d); } -FUNK int32x2_t vqrdmlah_lane_s32(int32x2_t __a, int32x2_t __b, int32x2_t __c, - const int __d) { +__funline int32x2_t vqrdmlah_lane_s32(int32x2_t __a, int32x2_t __b, + int32x2_t __c, const int __d) { return __builtin_aarch64_sqrdmlah_lanev2si(__a, __b, __c, __d); } -FUNK int16x8_t vqrdmlahq_lane_s16(int16x8_t __a, int16x8_t __b, int16x4_t __c, - const int __d) { +__funline int16x8_t vqrdmlahq_lane_s16(int16x8_t __a, int16x8_t __b, + int16x4_t __c, const int __d) { return __builtin_aarch64_sqrdmlah_lanev8hi(__a, __b, __c, __d); } -FUNK int32x4_t vqrdmlahq_lane_s32(int32x4_t __a, int32x4_t __b, int32x2_t __c, - const int __d) { +__funline int32x4_t vqrdmlahq_lane_s32(int32x4_t __a, int32x4_t __b, + int32x2_t __c, const int __d) { return __builtin_aarch64_sqrdmlah_lanev4si(__a, __b, __c, __d); } -FUNK int16_t vqrdmlahh_s16(int16_t __a, int16_t 
__b, int16_t __c) { +__funline int16_t vqrdmlahh_s16(int16_t __a, int16_t __b, int16_t __c) { return (int16_t)__builtin_aarch64_sqrdmlahhi(__a, __b, __c); } -FUNK int16_t vqrdmlahh_lane_s16(int16_t __a, int16_t __b, int16x4_t __c, - const int __d) { +__funline int16_t vqrdmlahh_lane_s16(int16_t __a, int16_t __b, int16x4_t __c, + const int __d) { return __builtin_aarch64_sqrdmlah_lanehi(__a, __b, __c, __d); } -FUNK int16_t vqrdmlahh_laneq_s16(int16_t __a, int16_t __b, int16x8_t __c, - const int __d) { +__funline int16_t vqrdmlahh_laneq_s16(int16_t __a, int16_t __b, int16x8_t __c, + const int __d) { return __builtin_aarch64_sqrdmlah_laneqhi(__a, __b, __c, __d); } -FUNK int32_t vqrdmlahs_s32(int32_t __a, int32_t __b, int32_t __c) { +__funline int32_t vqrdmlahs_s32(int32_t __a, int32_t __b, int32_t __c) { return (int32_t)__builtin_aarch64_sqrdmlahsi(__a, __b, __c); } -FUNK int32_t vqrdmlahs_lane_s32(int32_t __a, int32_t __b, int32x2_t __c, - const int __d) { +__funline int32_t vqrdmlahs_lane_s32(int32_t __a, int32_t __b, int32x2_t __c, + const int __d) { return __builtin_aarch64_sqrdmlah_lanesi(__a, __b, __c, __d); } -FUNK int32_t vqrdmlahs_laneq_s32(int32_t __a, int32_t __b, int32x4_t __c, - const int __d) { +__funline int32_t vqrdmlahs_laneq_s32(int32_t __a, int32_t __b, int32x4_t __c, + const int __d) { return __builtin_aarch64_sqrdmlah_laneqsi(__a, __b, __c, __d); } -FUNK int16x4_t vqrdmlsh_lane_s16(int16x4_t __a, int16x4_t __b, int16x4_t __c, - const int __d) { +__funline int16x4_t vqrdmlsh_lane_s16(int16x4_t __a, int16x4_t __b, + int16x4_t __c, const int __d) { return __builtin_aarch64_sqrdmlsh_lanev4hi(__a, __b, __c, __d); } -FUNK int32x2_t vqrdmlsh_lane_s32(int32x2_t __a, int32x2_t __b, int32x2_t __c, - const int __d) { +__funline int32x2_t vqrdmlsh_lane_s32(int32x2_t __a, int32x2_t __b, + int32x2_t __c, const int __d) { return __builtin_aarch64_sqrdmlsh_lanev2si(__a, __b, __c, __d); } -FUNK int16x8_t vqrdmlshq_lane_s16(int16x8_t __a, int16x8_t __b, int16x4_t __c, - const int __d) { +__funline int16x8_t vqrdmlshq_lane_s16(int16x8_t __a, int16x8_t __b, + int16x4_t __c, const int __d) { return __builtin_aarch64_sqrdmlsh_lanev8hi(__a, __b, __c, __d); } -FUNK int32x4_t vqrdmlshq_lane_s32(int32x4_t __a, int32x4_t __b, int32x2_t __c, - const int __d) { +__funline int32x4_t vqrdmlshq_lane_s32(int32x4_t __a, int32x4_t __b, + int32x2_t __c, const int __d) { return __builtin_aarch64_sqrdmlsh_lanev4si(__a, __b, __c, __d); } -FUNK int16_t vqrdmlshh_s16(int16_t __a, int16_t __b, int16_t __c) { +__funline int16_t vqrdmlshh_s16(int16_t __a, int16_t __b, int16_t __c) { return (int16_t)__builtin_aarch64_sqrdmlshhi(__a, __b, __c); } -FUNK int16_t vqrdmlshh_lane_s16(int16_t __a, int16_t __b, int16x4_t __c, - const int __d) { +__funline int16_t vqrdmlshh_lane_s16(int16_t __a, int16_t __b, int16x4_t __c, + const int __d) { return __builtin_aarch64_sqrdmlsh_lanehi(__a, __b, __c, __d); } -FUNK int16_t vqrdmlshh_laneq_s16(int16_t __a, int16_t __b, int16x8_t __c, - const int __d) { +__funline int16_t vqrdmlshh_laneq_s16(int16_t __a, int16_t __b, int16x8_t __c, + const int __d) { return __builtin_aarch64_sqrdmlsh_laneqhi(__a, __b, __c, __d); } -FUNK int32_t vqrdmlshs_s32(int32_t __a, int32_t __b, int32_t __c) { +__funline int32_t vqrdmlshs_s32(int32_t __a, int32_t __b, int32_t __c) { return (int32_t)__builtin_aarch64_sqrdmlshsi(__a, __b, __c); } -FUNK int32_t vqrdmlshs_lane_s32(int32_t __a, int32_t __b, int32x2_t __c, - const int __d) { +__funline int32_t vqrdmlshs_lane_s32(int32_t __a, int32_t __b, int32x2_t 
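/*
 * The vqrdml{a,s}h intrinsics above are the ARMv8.1 RDMA operations:
 * saturating rounding doubling multiply returning the high half, then
 * accumulate (ah) or subtract (sh). Per 16-bit lane the behaviour is
 * roughly this sketch (not the exact architectural pseudocode):
 *
 *   int16_t sqrdmlah16(int16_t a, int16_t b, int16_t c) {
 *     int32_t p = 2 * (int32_t)b * (int32_t)c + (1 << 15);  // round
 *     int32_t r = a + (p >> 16);                            // high half
 *     return r > 32767 ? 32767 : r < -32768 ? -32768 : r;   // saturate
 *   }
 *
 * The _lane/_laneq forms take the multiplier from lane __d of __c.
 */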
__c, + const int __d) { return __builtin_aarch64_sqrdmlsh_lanesi(__a, __b, __c, __d); } -FUNK int32_t vqrdmlshs_laneq_s32(int32_t __a, int32_t __b, int32x4_t __c, - const int __d) { +__funline int32_t vqrdmlshs_laneq_s32(int32_t __a, int32_t __b, int32x4_t __c, + const int __d) { return __builtin_aarch64_sqrdmlsh_laneqsi(__a, __b, __c, __d); } @@ -8269,2229 +8292,2230 @@ FUNK int32_t vqrdmlshs_laneq_s32(int32_t __a, int32_t __b, int32x4_t __c, #pragma GCC push_options #pragma GCC target("+nothing+crypto") -FUNK uint8x16_t vaeseq_u8(uint8x16_t data, uint8x16_t key) { +__funline uint8x16_t vaeseq_u8(uint8x16_t data, uint8x16_t key) { return __builtin_aarch64_crypto_aesev16qi_uuu(data, key); } -FUNK uint8x16_t vaesdq_u8(uint8x16_t data, uint8x16_t key) { +__funline uint8x16_t vaesdq_u8(uint8x16_t data, uint8x16_t key) { return __builtin_aarch64_crypto_aesdv16qi_uuu(data, key); } -FUNK uint8x16_t vaesmcq_u8(uint8x16_t data) { +__funline uint8x16_t vaesmcq_u8(uint8x16_t data) { return __builtin_aarch64_crypto_aesmcv16qi_uu(data); } -FUNK uint8x16_t vaesimcq_u8(uint8x16_t data) { +__funline uint8x16_t vaesimcq_u8(uint8x16_t data) { return __builtin_aarch64_crypto_aesimcv16qi_uu(data); } #pragma GCC pop_options -FUNK uint64x1_t vcage_f64(float64x1_t __a, float64x1_t __b) { +__funline uint64x1_t vcage_f64(float64x1_t __a, float64x1_t __b) { return vabs_f64(__a) >= vabs_f64(__b); } -FUNK uint32_t vcages_f32(float32_t __a, float32_t __b) { +__funline uint32_t vcages_f32(float32_t __a, float32_t __b) { return __builtin_fabsf(__a) >= __builtin_fabsf(__b) ? -1 : 0; } -FUNK uint32x2_t vcage_f32(float32x2_t __a, float32x2_t __b) { +__funline uint32x2_t vcage_f32(float32x2_t __a, float32x2_t __b) { return vabs_f32(__a) >= vabs_f32(__b); } -FUNK uint32x4_t vcageq_f32(float32x4_t __a, float32x4_t __b) { +__funline uint32x4_t vcageq_f32(float32x4_t __a, float32x4_t __b) { return vabsq_f32(__a) >= vabsq_f32(__b); } -FUNK uint64_t vcaged_f64(float64_t __a, float64_t __b) { +__funline uint64_t vcaged_f64(float64_t __a, float64_t __b) { return __builtin_fabs(__a) >= __builtin_fabs(__b) ? -1 : 0; } -FUNK uint64x2_t vcageq_f64(float64x2_t __a, float64x2_t __b) { +__funline uint64x2_t vcageq_f64(float64x2_t __a, float64x2_t __b) { return vabsq_f64(__a) >= vabsq_f64(__b); } -FUNK uint32_t vcagts_f32(float32_t __a, float32_t __b) { +__funline uint32_t vcagts_f32(float32_t __a, float32_t __b) { return __builtin_fabsf(__a) > __builtin_fabsf(__b) ? -1 : 0; } -FUNK uint32x2_t vcagt_f32(float32x2_t __a, float32x2_t __b) { +__funline uint32x2_t vcagt_f32(float32x2_t __a, float32x2_t __b) { return vabs_f32(__a) > vabs_f32(__b); } -FUNK uint64x1_t vcagt_f64(float64x1_t __a, float64x1_t __b) { +__funline uint64x1_t vcagt_f64(float64x1_t __a, float64x1_t __b) { return vabs_f64(__a) > vabs_f64(__b); } -FUNK uint32x4_t vcagtq_f32(float32x4_t __a, float32x4_t __b) { +__funline uint32x4_t vcagtq_f32(float32x4_t __a, float32x4_t __b) { return vabsq_f32(__a) > vabsq_f32(__b); } -FUNK uint64_t vcagtd_f64(float64_t __a, float64_t __b) { +__funline uint64_t vcagtd_f64(float64_t __a, float64_t __b) { return __builtin_fabs(__a) > __builtin_fabs(__b) ? 
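/*
 * The crypto hunk above wraps the ARMv8 AES instructions: vaeseq_u8 is
 * AESE (data ^ key followed by ShiftRows and SubBytes) and vaesmcq_u8
 * is the MixColumns step, so one full encryption round is, loosely:
 *
 *   state = vaesmcq_u8(vaeseq_u8(state, roundkey));
 *
 * with the final round using vaeseq_u8 alone plus a last key XOR.
 */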
-1 : 0; } -FUNK uint64x2_t vcagtq_f64(float64x2_t __a, float64x2_t __b) { +__funline uint64x2_t vcagtq_f64(float64x2_t __a, float64x2_t __b) { return vabsq_f64(__a) > vabsq_f64(__b); } -FUNK uint32x2_t vcale_f32(float32x2_t __a, float32x2_t __b) { +__funline uint32x2_t vcale_f32(float32x2_t __a, float32x2_t __b) { return vabs_f32(__a) <= vabs_f32(__b); } -FUNK uint64x1_t vcale_f64(float64x1_t __a, float64x1_t __b) { +__funline uint64x1_t vcale_f64(float64x1_t __a, float64x1_t __b) { return vabs_f64(__a) <= vabs_f64(__b); } -FUNK uint64_t vcaled_f64(float64_t __a, float64_t __b) { +__funline uint64_t vcaled_f64(float64_t __a, float64_t __b) { return __builtin_fabs(__a) <= __builtin_fabs(__b) ? -1 : 0; } -FUNK uint32_t vcales_f32(float32_t __a, float32_t __b) { +__funline uint32_t vcales_f32(float32_t __a, float32_t __b) { return __builtin_fabsf(__a) <= __builtin_fabsf(__b) ? -1 : 0; } -FUNK uint32x4_t vcaleq_f32(float32x4_t __a, float32x4_t __b) { +__funline uint32x4_t vcaleq_f32(float32x4_t __a, float32x4_t __b) { return vabsq_f32(__a) <= vabsq_f32(__b); } -FUNK uint64x2_t vcaleq_f64(float64x2_t __a, float64x2_t __b) { +__funline uint64x2_t vcaleq_f64(float64x2_t __a, float64x2_t __b) { return vabsq_f64(__a) <= vabsq_f64(__b); } -FUNK uint32x2_t vcalt_f32(float32x2_t __a, float32x2_t __b) { +__funline uint32x2_t vcalt_f32(float32x2_t __a, float32x2_t __b) { return vabs_f32(__a) < vabs_f32(__b); } -FUNK uint64x1_t vcalt_f64(float64x1_t __a, float64x1_t __b) { +__funline uint64x1_t vcalt_f64(float64x1_t __a, float64x1_t __b) { return vabs_f64(__a) < vabs_f64(__b); } -FUNK uint64_t vcaltd_f64(float64_t __a, float64_t __b) { +__funline uint64_t vcaltd_f64(float64_t __a, float64_t __b) { return __builtin_fabs(__a) < __builtin_fabs(__b) ? -1 : 0; } -FUNK uint32x4_t vcaltq_f32(float32x4_t __a, float32x4_t __b) { +__funline uint32x4_t vcaltq_f32(float32x4_t __a, float32x4_t __b) { return vabsq_f32(__a) < vabsq_f32(__b); } -FUNK uint64x2_t vcaltq_f64(float64x2_t __a, float64x2_t __b) { +__funline uint64x2_t vcaltq_f64(float64x2_t __a, float64x2_t __b) { return vabsq_f64(__a) < vabsq_f64(__b); } -FUNK uint32_t vcalts_f32(float32_t __a, float32_t __b) { +__funline uint32_t vcalts_f32(float32_t __a, float32_t __b) { return __builtin_fabsf(__a) < __builtin_fabsf(__b) ? 
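/*
 * vcage/vcagt/vcale/vcalt are the "absolute" comparisons: they compare
 * |__a| against |__b| and, like every NEON comparison, yield an
 * all-ones lane for true and an all-zeros lane for false, which is why
 * the scalar forms read `? -1 : 0`. An illustrative case:
 *
 *   uint32x2_t m = vcagt_f32(vdup_n_f32(-3.0f), vdup_n_f32(2.0f));
 *   // |-3.0f| > |2.0f| in both lanes, so each lane of m is 0xFFFFFFFF
 */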
-1 : 0; } -FUNK uint32x2_t vceq_f32(float32x2_t __a, float32x2_t __b) { +__funline uint32x2_t vceq_f32(float32x2_t __a, float32x2_t __b) { return (uint32x2_t)(__a == __b); } -FUNK uint64x1_t vceq_f64(float64x1_t __a, float64x1_t __b) { +__funline uint64x1_t vceq_f64(float64x1_t __a, float64x1_t __b) { return (uint64x1_t)(__a == __b); } -FUNK uint8x8_t vceq_p8(poly8x8_t __a, poly8x8_t __b) { +__funline uint8x8_t vceq_p8(poly8x8_t __a, poly8x8_t __b) { return (uint8x8_t)(__a == __b); } -FUNK uint64x1_t vceq_p64(poly64x1_t __a, poly64x1_t __b) { +__funline uint64x1_t vceq_p64(poly64x1_t __a, poly64x1_t __b) { return (uint64x1_t)(__a == __b); } -FUNK uint8x8_t vceq_s8(int8x8_t __a, int8x8_t __b) { +__funline uint8x8_t vceq_s8(int8x8_t __a, int8x8_t __b) { return (uint8x8_t)(__a == __b); } -FUNK uint16x4_t vceq_s16(int16x4_t __a, int16x4_t __b) { +__funline uint16x4_t vceq_s16(int16x4_t __a, int16x4_t __b) { return (uint16x4_t)(__a == __b); } -FUNK uint32x2_t vceq_s32(int32x2_t __a, int32x2_t __b) { +__funline uint32x2_t vceq_s32(int32x2_t __a, int32x2_t __b) { return (uint32x2_t)(__a == __b); } -FUNK uint64x1_t vceq_s64(int64x1_t __a, int64x1_t __b) { +__funline uint64x1_t vceq_s64(int64x1_t __a, int64x1_t __b) { return (uint64x1_t)(__a == __b); } -FUNK uint8x8_t vceq_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vceq_u8(uint8x8_t __a, uint8x8_t __b) { return (__a == __b); } -FUNK uint16x4_t vceq_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vceq_u16(uint16x4_t __a, uint16x4_t __b) { return (__a == __b); } -FUNK uint32x2_t vceq_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vceq_u32(uint32x2_t __a, uint32x2_t __b) { return (__a == __b); } -FUNK uint64x1_t vceq_u64(uint64x1_t __a, uint64x1_t __b) { +__funline uint64x1_t vceq_u64(uint64x1_t __a, uint64x1_t __b) { return (__a == __b); } -FUNK uint32x4_t vceqq_f32(float32x4_t __a, float32x4_t __b) { +__funline uint32x4_t vceqq_f32(float32x4_t __a, float32x4_t __b) { return (uint32x4_t)(__a == __b); } -FUNK uint64x2_t vceqq_f64(float64x2_t __a, float64x2_t __b) { +__funline uint64x2_t vceqq_f64(float64x2_t __a, float64x2_t __b) { return (uint64x2_t)(__a == __b); } -FUNK uint8x16_t vceqq_p8(poly8x16_t __a, poly8x16_t __b) { +__funline uint8x16_t vceqq_p8(poly8x16_t __a, poly8x16_t __b) { return (uint8x16_t)(__a == __b); } -FUNK uint8x16_t vceqq_s8(int8x16_t __a, int8x16_t __b) { +__funline uint8x16_t vceqq_s8(int8x16_t __a, int8x16_t __b) { return (uint8x16_t)(__a == __b); } -FUNK uint16x8_t vceqq_s16(int16x8_t __a, int16x8_t __b) { +__funline uint16x8_t vceqq_s16(int16x8_t __a, int16x8_t __b) { return (uint16x8_t)(__a == __b); } -FUNK uint32x4_t vceqq_s32(int32x4_t __a, int32x4_t __b) { +__funline uint32x4_t vceqq_s32(int32x4_t __a, int32x4_t __b) { return (uint32x4_t)(__a == __b); } -FUNK uint64x2_t vceqq_s64(int64x2_t __a, int64x2_t __b) { +__funline uint64x2_t vceqq_s64(int64x2_t __a, int64x2_t __b) { return (uint64x2_t)(__a == __b); } -FUNK uint8x16_t vceqq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vceqq_u8(uint8x16_t __a, uint8x16_t __b) { return (__a == __b); } -FUNK uint16x8_t vceqq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vceqq_u16(uint16x8_t __a, uint16x8_t __b) { return (__a == __b); } -FUNK uint32x4_t vceqq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vceqq_u32(uint32x4_t __a, uint32x4_t __b) { return (__a == __b); } -FUNK uint64x2_t vceqq_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vceqq_u64(uint64x2_t __a, uint64x2_t __b) { return 
(__a == __b); } -FUNK uint32_t vceqs_f32(float32_t __a, float32_t __b) { +__funline uint32_t vceqs_f32(float32_t __a, float32_t __b) { return __a == __b ? -1 : 0; } -FUNK uint64_t vceqd_s64(int64_t __a, int64_t __b) { +__funline uint64_t vceqd_s64(int64_t __a, int64_t __b) { return __a == __b ? -1ll : 0ll; } -FUNK uint64_t vceqd_u64(uint64_t __a, uint64_t __b) { +__funline uint64_t vceqd_u64(uint64_t __a, uint64_t __b) { return __a == __b ? -1ll : 0ll; } -FUNK uint64_t vceqd_f64(float64_t __a, float64_t __b) { +__funline uint64_t vceqd_f64(float64_t __a, float64_t __b) { return __a == __b ? -1ll : 0ll; } -FUNK uint32x2_t vceqz_f32(float32x2_t __a) { +__funline uint32x2_t vceqz_f32(float32x2_t __a) { return (uint32x2_t)(__a == 0.0f); } -FUNK uint64x1_t vceqz_f64(float64x1_t __a) { +__funline uint64x1_t vceqz_f64(float64x1_t __a) { return (uint64x1_t)(__a == (float64x1_t){0.0}); } -FUNK uint8x8_t vceqz_p8(poly8x8_t __a) { +__funline uint8x8_t vceqz_p8(poly8x8_t __a) { return (uint8x8_t)(__a == 0); } -FUNK uint8x8_t vceqz_s8(int8x8_t __a) { +__funline uint8x8_t vceqz_s8(int8x8_t __a) { return (uint8x8_t)(__a == 0); } -FUNK uint16x4_t vceqz_s16(int16x4_t __a) { +__funline uint16x4_t vceqz_s16(int16x4_t __a) { return (uint16x4_t)(__a == 0); } -FUNK uint32x2_t vceqz_s32(int32x2_t __a) { +__funline uint32x2_t vceqz_s32(int32x2_t __a) { return (uint32x2_t)(__a == 0); } -FUNK uint64x1_t vceqz_s64(int64x1_t __a) { +__funline uint64x1_t vceqz_s64(int64x1_t __a) { return (uint64x1_t)(__a == __AARCH64_INT64_C(0)); } -FUNK uint8x8_t vceqz_u8(uint8x8_t __a) { +__funline uint8x8_t vceqz_u8(uint8x8_t __a) { return (__a == 0); } -FUNK uint16x4_t vceqz_u16(uint16x4_t __a) { +__funline uint16x4_t vceqz_u16(uint16x4_t __a) { return (__a == 0); } -FUNK uint32x2_t vceqz_u32(uint32x2_t __a) { +__funline uint32x2_t vceqz_u32(uint32x2_t __a) { return (__a == 0); } -FUNK uint64x1_t vceqz_u64(uint64x1_t __a) { +__funline uint64x1_t vceqz_u64(uint64x1_t __a) { return (__a == __AARCH64_UINT64_C(0)); } -FUNK uint32x4_t vceqzq_f32(float32x4_t __a) { +__funline uint32x4_t vceqzq_f32(float32x4_t __a) { return (uint32x4_t)(__a == 0.0f); } -FUNK uint64x2_t vceqzq_f64(float64x2_t __a) { +__funline uint64x2_t vceqzq_f64(float64x2_t __a) { return (uint64x2_t)(__a == 0.0f); } -FUNK uint8x16_t vceqzq_p8(poly8x16_t __a) { +__funline uint8x16_t vceqzq_p8(poly8x16_t __a) { return (uint8x16_t)(__a == 0); } -FUNK uint8x16_t vceqzq_s8(int8x16_t __a) { +__funline uint8x16_t vceqzq_s8(int8x16_t __a) { return (uint8x16_t)(__a == 0); } -FUNK uint16x8_t vceqzq_s16(int16x8_t __a) { +__funline uint16x8_t vceqzq_s16(int16x8_t __a) { return (uint16x8_t)(__a == 0); } -FUNK uint32x4_t vceqzq_s32(int32x4_t __a) { +__funline uint32x4_t vceqzq_s32(int32x4_t __a) { return (uint32x4_t)(__a == 0); } -FUNK uint64x2_t vceqzq_s64(int64x2_t __a) { +__funline uint64x2_t vceqzq_s64(int64x2_t __a) { return (uint64x2_t)(__a == __AARCH64_INT64_C(0)); } -FUNK uint8x16_t vceqzq_u8(uint8x16_t __a) { +__funline uint8x16_t vceqzq_u8(uint8x16_t __a) { return (__a == 0); } -FUNK uint16x8_t vceqzq_u16(uint16x8_t __a) { +__funline uint16x8_t vceqzq_u16(uint16x8_t __a) { return (__a == 0); } -FUNK uint32x4_t vceqzq_u32(uint32x4_t __a) { +__funline uint32x4_t vceqzq_u32(uint32x4_t __a) { return (__a == 0); } -FUNK uint64x2_t vceqzq_u64(uint64x2_t __a) { +__funline uint64x2_t vceqzq_u64(uint64x2_t __a) { return (__a == __AARCH64_UINT64_C(0)); } -FUNK uint32_t vceqzs_f32(float32_t __a) { +__funline uint32_t vceqzs_f32(float32_t __a) { return __a == 0.0f ? 
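/*
 * The ...z variants above (vceqz, and vcgez/vcgtz/vclez/vcltz further
 * on) test each lane against zero directly, which lets the compiler
 * emit the CMEQ/CMGE/CMGT/CMLE/CMLT zero-operand forms instead of
 * materialising a zero vector. A small sketch:
 *
 *   int32x2_t v = {-5, 7};
 *   uint32x2_t z = vceqz_s32(v);   // {0, 0}: neither lane is zero
 *   uint32x2_t g = vcgez_s32(v);   // {0, 0xFFFFFFFF}
 */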
-1 : 0; } -FUNK uint64_t vceqzd_s64(int64_t __a) { +__funline uint64_t vceqzd_s64(int64_t __a) { return __a == 0 ? -1ll : 0ll; } -FUNK uint64_t vceqzd_u64(uint64_t __a) { +__funline uint64_t vceqzd_u64(uint64_t __a) { return __a == 0 ? -1ll : 0ll; } -FUNK uint64_t vceqzd_f64(float64_t __a) { +__funline uint64_t vceqzd_f64(float64_t __a) { return __a == 0.0 ? -1ll : 0ll; } -FUNK uint32x2_t vcge_f32(float32x2_t __a, float32x2_t __b) { +__funline uint32x2_t vcge_f32(float32x2_t __a, float32x2_t __b) { return (uint32x2_t)(__a >= __b); } -FUNK uint64x1_t vcge_f64(float64x1_t __a, float64x1_t __b) { +__funline uint64x1_t vcge_f64(float64x1_t __a, float64x1_t __b) { return (uint64x1_t)(__a >= __b); } -FUNK uint8x8_t vcge_s8(int8x8_t __a, int8x8_t __b) { +__funline uint8x8_t vcge_s8(int8x8_t __a, int8x8_t __b) { return (uint8x8_t)(__a >= __b); } -FUNK uint16x4_t vcge_s16(int16x4_t __a, int16x4_t __b) { +__funline uint16x4_t vcge_s16(int16x4_t __a, int16x4_t __b) { return (uint16x4_t)(__a >= __b); } -FUNK uint32x2_t vcge_s32(int32x2_t __a, int32x2_t __b) { +__funline uint32x2_t vcge_s32(int32x2_t __a, int32x2_t __b) { return (uint32x2_t)(__a >= __b); } -FUNK uint64x1_t vcge_s64(int64x1_t __a, int64x1_t __b) { +__funline uint64x1_t vcge_s64(int64x1_t __a, int64x1_t __b) { return (uint64x1_t)(__a >= __b); } -FUNK uint8x8_t vcge_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vcge_u8(uint8x8_t __a, uint8x8_t __b) { return (__a >= __b); } -FUNK uint16x4_t vcge_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vcge_u16(uint16x4_t __a, uint16x4_t __b) { return (__a >= __b); } -FUNK uint32x2_t vcge_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vcge_u32(uint32x2_t __a, uint32x2_t __b) { return (__a >= __b); } -FUNK uint64x1_t vcge_u64(uint64x1_t __a, uint64x1_t __b) { +__funline uint64x1_t vcge_u64(uint64x1_t __a, uint64x1_t __b) { return (__a >= __b); } -FUNK uint32x4_t vcgeq_f32(float32x4_t __a, float32x4_t __b) { +__funline uint32x4_t vcgeq_f32(float32x4_t __a, float32x4_t __b) { return (uint32x4_t)(__a >= __b); } -FUNK uint64x2_t vcgeq_f64(float64x2_t __a, float64x2_t __b) { +__funline uint64x2_t vcgeq_f64(float64x2_t __a, float64x2_t __b) { return (uint64x2_t)(__a >= __b); } -FUNK uint8x16_t vcgeq_s8(int8x16_t __a, int8x16_t __b) { +__funline uint8x16_t vcgeq_s8(int8x16_t __a, int8x16_t __b) { return (uint8x16_t)(__a >= __b); } -FUNK uint16x8_t vcgeq_s16(int16x8_t __a, int16x8_t __b) { +__funline uint16x8_t vcgeq_s16(int16x8_t __a, int16x8_t __b) { return (uint16x8_t)(__a >= __b); } -FUNK uint32x4_t vcgeq_s32(int32x4_t __a, int32x4_t __b) { +__funline uint32x4_t vcgeq_s32(int32x4_t __a, int32x4_t __b) { return (uint32x4_t)(__a >= __b); } -FUNK uint64x2_t vcgeq_s64(int64x2_t __a, int64x2_t __b) { +__funline uint64x2_t vcgeq_s64(int64x2_t __a, int64x2_t __b) { return (uint64x2_t)(__a >= __b); } -FUNK uint8x16_t vcgeq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vcgeq_u8(uint8x16_t __a, uint8x16_t __b) { return (__a >= __b); } -FUNK uint16x8_t vcgeq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vcgeq_u16(uint16x8_t __a, uint16x8_t __b) { return (__a >= __b); } -FUNK uint32x4_t vcgeq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vcgeq_u32(uint32x4_t __a, uint32x4_t __b) { return (__a >= __b); } -FUNK uint64x2_t vcgeq_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vcgeq_u64(uint64x2_t __a, uint64x2_t __b) { return (__a >= __b); } -FUNK uint32_t vcges_f32(float32_t __a, float32_t __b) { +__funline uint32_t 
vcges_f32(float32_t __a, float32_t __b) { return __a >= __b ? -1 : 0; } -FUNK uint64_t vcged_s64(int64_t __a, int64_t __b) { +__funline uint64_t vcged_s64(int64_t __a, int64_t __b) { return __a >= __b ? -1ll : 0ll; } -FUNK uint64_t vcged_u64(uint64_t __a, uint64_t __b) { +__funline uint64_t vcged_u64(uint64_t __a, uint64_t __b) { return __a >= __b ? -1ll : 0ll; } -FUNK uint64_t vcged_f64(float64_t __a, float64_t __b) { +__funline uint64_t vcged_f64(float64_t __a, float64_t __b) { return __a >= __b ? -1ll : 0ll; } -FUNK uint32x2_t vcgez_f32(float32x2_t __a) { +__funline uint32x2_t vcgez_f32(float32x2_t __a) { return (uint32x2_t)(__a >= 0.0f); } -FUNK uint64x1_t vcgez_f64(float64x1_t __a) { +__funline uint64x1_t vcgez_f64(float64x1_t __a) { return (uint64x1_t)(__a[0] >= (float64x1_t){0.0}); } -FUNK uint8x8_t vcgez_s8(int8x8_t __a) { +__funline uint8x8_t vcgez_s8(int8x8_t __a) { return (uint8x8_t)(__a >= 0); } -FUNK uint16x4_t vcgez_s16(int16x4_t __a) { +__funline uint16x4_t vcgez_s16(int16x4_t __a) { return (uint16x4_t)(__a >= 0); } -FUNK uint32x2_t vcgez_s32(int32x2_t __a) { +__funline uint32x2_t vcgez_s32(int32x2_t __a) { return (uint32x2_t)(__a >= 0); } -FUNK uint64x1_t vcgez_s64(int64x1_t __a) { +__funline uint64x1_t vcgez_s64(int64x1_t __a) { return (uint64x1_t)(__a >= __AARCH64_INT64_C(0)); } -FUNK uint32x4_t vcgezq_f32(float32x4_t __a) { +__funline uint32x4_t vcgezq_f32(float32x4_t __a) { return (uint32x4_t)(__a >= 0.0f); } -FUNK uint64x2_t vcgezq_f64(float64x2_t __a) { +__funline uint64x2_t vcgezq_f64(float64x2_t __a) { return (uint64x2_t)(__a >= 0.0); } -FUNK uint8x16_t vcgezq_s8(int8x16_t __a) { +__funline uint8x16_t vcgezq_s8(int8x16_t __a) { return (uint8x16_t)(__a >= 0); } -FUNK uint16x8_t vcgezq_s16(int16x8_t __a) { +__funline uint16x8_t vcgezq_s16(int16x8_t __a) { return (uint16x8_t)(__a >= 0); } -FUNK uint32x4_t vcgezq_s32(int32x4_t __a) { +__funline uint32x4_t vcgezq_s32(int32x4_t __a) { return (uint32x4_t)(__a >= 0); } -FUNK uint64x2_t vcgezq_s64(int64x2_t __a) { +__funline uint64x2_t vcgezq_s64(int64x2_t __a) { return (uint64x2_t)(__a >= __AARCH64_INT64_C(0)); } -FUNK uint32_t vcgezs_f32(float32_t __a) { +__funline uint32_t vcgezs_f32(float32_t __a) { return __a >= 0.0f ? -1 : 0; } -FUNK uint64_t vcgezd_s64(int64_t __a) { +__funline uint64_t vcgezd_s64(int64_t __a) { return __a >= 0 ? -1ll : 0ll; } -FUNK uint64_t vcgezd_f64(float64_t __a) { +__funline uint64_t vcgezd_f64(float64_t __a) { return __a >= 0.0 ? 
-1ll : 0ll; } -FUNK uint32x2_t vcgt_f32(float32x2_t __a, float32x2_t __b) { +__funline uint32x2_t vcgt_f32(float32x2_t __a, float32x2_t __b) { return (uint32x2_t)(__a > __b); } -FUNK uint64x1_t vcgt_f64(float64x1_t __a, float64x1_t __b) { +__funline uint64x1_t vcgt_f64(float64x1_t __a, float64x1_t __b) { return (uint64x1_t)(__a > __b); } -FUNK uint8x8_t vcgt_s8(int8x8_t __a, int8x8_t __b) { +__funline uint8x8_t vcgt_s8(int8x8_t __a, int8x8_t __b) { return (uint8x8_t)(__a > __b); } -FUNK uint16x4_t vcgt_s16(int16x4_t __a, int16x4_t __b) { +__funline uint16x4_t vcgt_s16(int16x4_t __a, int16x4_t __b) { return (uint16x4_t)(__a > __b); } -FUNK uint32x2_t vcgt_s32(int32x2_t __a, int32x2_t __b) { +__funline uint32x2_t vcgt_s32(int32x2_t __a, int32x2_t __b) { return (uint32x2_t)(__a > __b); } -FUNK uint64x1_t vcgt_s64(int64x1_t __a, int64x1_t __b) { +__funline uint64x1_t vcgt_s64(int64x1_t __a, int64x1_t __b) { return (uint64x1_t)(__a > __b); } -FUNK uint8x8_t vcgt_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vcgt_u8(uint8x8_t __a, uint8x8_t __b) { return (__a > __b); } -FUNK uint16x4_t vcgt_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vcgt_u16(uint16x4_t __a, uint16x4_t __b) { return (__a > __b); } -FUNK uint32x2_t vcgt_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vcgt_u32(uint32x2_t __a, uint32x2_t __b) { return (__a > __b); } -FUNK uint64x1_t vcgt_u64(uint64x1_t __a, uint64x1_t __b) { +__funline uint64x1_t vcgt_u64(uint64x1_t __a, uint64x1_t __b) { return (__a > __b); } -FUNK uint32x4_t vcgtq_f32(float32x4_t __a, float32x4_t __b) { +__funline uint32x4_t vcgtq_f32(float32x4_t __a, float32x4_t __b) { return (uint32x4_t)(__a > __b); } -FUNK uint64x2_t vcgtq_f64(float64x2_t __a, float64x2_t __b) { +__funline uint64x2_t vcgtq_f64(float64x2_t __a, float64x2_t __b) { return (uint64x2_t)(__a > __b); } -FUNK uint8x16_t vcgtq_s8(int8x16_t __a, int8x16_t __b) { +__funline uint8x16_t vcgtq_s8(int8x16_t __a, int8x16_t __b) { return (uint8x16_t)(__a > __b); } -FUNK uint16x8_t vcgtq_s16(int16x8_t __a, int16x8_t __b) { +__funline uint16x8_t vcgtq_s16(int16x8_t __a, int16x8_t __b) { return (uint16x8_t)(__a > __b); } -FUNK uint32x4_t vcgtq_s32(int32x4_t __a, int32x4_t __b) { +__funline uint32x4_t vcgtq_s32(int32x4_t __a, int32x4_t __b) { return (uint32x4_t)(__a > __b); } -FUNK uint64x2_t vcgtq_s64(int64x2_t __a, int64x2_t __b) { +__funline uint64x2_t vcgtq_s64(int64x2_t __a, int64x2_t __b) { return (uint64x2_t)(__a > __b); } -FUNK uint8x16_t vcgtq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vcgtq_u8(uint8x16_t __a, uint8x16_t __b) { return (__a > __b); } -FUNK uint16x8_t vcgtq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vcgtq_u16(uint16x8_t __a, uint16x8_t __b) { return (__a > __b); } -FUNK uint32x4_t vcgtq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vcgtq_u32(uint32x4_t __a, uint32x4_t __b) { return (__a > __b); } -FUNK uint64x2_t vcgtq_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vcgtq_u64(uint64x2_t __a, uint64x2_t __b) { return (__a > __b); } -FUNK uint32_t vcgts_f32(float32_t __a, float32_t __b) { +__funline uint32_t vcgts_f32(float32_t __a, float32_t __b) { return __a > __b ? -1 : 0; } -FUNK uint64_t vcgtd_s64(int64_t __a, int64_t __b) { +__funline uint64_t vcgtd_s64(int64_t __a, int64_t __b) { return __a > __b ? -1ll : 0ll; } -FUNK uint64_t vcgtd_u64(uint64_t __a, uint64_t __b) { +__funline uint64_t vcgtd_u64(uint64_t __a, uint64_t __b) { return __a > __b ? 
-1ll : 0ll; } -FUNK uint64_t vcgtd_f64(float64_t __a, float64_t __b) { +__funline uint64_t vcgtd_f64(float64_t __a, float64_t __b) { return __a > __b ? -1ll : 0ll; } -FUNK uint32x2_t vcgtz_f32(float32x2_t __a) { +__funline uint32x2_t vcgtz_f32(float32x2_t __a) { return (uint32x2_t)(__a > 0.0f); } -FUNK uint64x1_t vcgtz_f64(float64x1_t __a) { +__funline uint64x1_t vcgtz_f64(float64x1_t __a) { return (uint64x1_t)(__a > (float64x1_t){0.0}); } -FUNK uint8x8_t vcgtz_s8(int8x8_t __a) { +__funline uint8x8_t vcgtz_s8(int8x8_t __a) { return (uint8x8_t)(__a > 0); } -FUNK uint16x4_t vcgtz_s16(int16x4_t __a) { +__funline uint16x4_t vcgtz_s16(int16x4_t __a) { return (uint16x4_t)(__a > 0); } -FUNK uint32x2_t vcgtz_s32(int32x2_t __a) { +__funline uint32x2_t vcgtz_s32(int32x2_t __a) { return (uint32x2_t)(__a > 0); } -FUNK uint64x1_t vcgtz_s64(int64x1_t __a) { +__funline uint64x1_t vcgtz_s64(int64x1_t __a) { return (uint64x1_t)(__a > __AARCH64_INT64_C(0)); } -FUNK uint32x4_t vcgtzq_f32(float32x4_t __a) { +__funline uint32x4_t vcgtzq_f32(float32x4_t __a) { return (uint32x4_t)(__a > 0.0f); } -FUNK uint64x2_t vcgtzq_f64(float64x2_t __a) { +__funline uint64x2_t vcgtzq_f64(float64x2_t __a) { return (uint64x2_t)(__a > 0.0); } -FUNK uint8x16_t vcgtzq_s8(int8x16_t __a) { +__funline uint8x16_t vcgtzq_s8(int8x16_t __a) { return (uint8x16_t)(__a > 0); } -FUNK uint16x8_t vcgtzq_s16(int16x8_t __a) { +__funline uint16x8_t vcgtzq_s16(int16x8_t __a) { return (uint16x8_t)(__a > 0); } -FUNK uint32x4_t vcgtzq_s32(int32x4_t __a) { +__funline uint32x4_t vcgtzq_s32(int32x4_t __a) { return (uint32x4_t)(__a > 0); } -FUNK uint64x2_t vcgtzq_s64(int64x2_t __a) { +__funline uint64x2_t vcgtzq_s64(int64x2_t __a) { return (uint64x2_t)(__a > __AARCH64_INT64_C(0)); } -FUNK uint32_t vcgtzs_f32(float32_t __a) { +__funline uint32_t vcgtzs_f32(float32_t __a) { return __a > 0.0f ? -1 : 0; } -FUNK uint64_t vcgtzd_s64(int64_t __a) { +__funline uint64_t vcgtzd_s64(int64_t __a) { return __a > 0 ? -1ll : 0ll; } -FUNK uint64_t vcgtzd_f64(float64_t __a) { +__funline uint64_t vcgtzd_f64(float64_t __a) { return __a > 0.0 ? 
-1ll : 0ll; } -FUNK uint32x2_t vcle_f32(float32x2_t __a, float32x2_t __b) { +__funline uint32x2_t vcle_f32(float32x2_t __a, float32x2_t __b) { return (uint32x2_t)(__a <= __b); } -FUNK uint64x1_t vcle_f64(float64x1_t __a, float64x1_t __b) { +__funline uint64x1_t vcle_f64(float64x1_t __a, float64x1_t __b) { return (uint64x1_t)(__a <= __b); } -FUNK uint8x8_t vcle_s8(int8x8_t __a, int8x8_t __b) { +__funline uint8x8_t vcle_s8(int8x8_t __a, int8x8_t __b) { return (uint8x8_t)(__a <= __b); } -FUNK uint16x4_t vcle_s16(int16x4_t __a, int16x4_t __b) { +__funline uint16x4_t vcle_s16(int16x4_t __a, int16x4_t __b) { return (uint16x4_t)(__a <= __b); } -FUNK uint32x2_t vcle_s32(int32x2_t __a, int32x2_t __b) { +__funline uint32x2_t vcle_s32(int32x2_t __a, int32x2_t __b) { return (uint32x2_t)(__a <= __b); } -FUNK uint64x1_t vcle_s64(int64x1_t __a, int64x1_t __b) { +__funline uint64x1_t vcle_s64(int64x1_t __a, int64x1_t __b) { return (uint64x1_t)(__a <= __b); } -FUNK uint8x8_t vcle_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vcle_u8(uint8x8_t __a, uint8x8_t __b) { return (__a <= __b); } -FUNK uint16x4_t vcle_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vcle_u16(uint16x4_t __a, uint16x4_t __b) { return (__a <= __b); } -FUNK uint32x2_t vcle_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vcle_u32(uint32x2_t __a, uint32x2_t __b) { return (__a <= __b); } -FUNK uint64x1_t vcle_u64(uint64x1_t __a, uint64x1_t __b) { +__funline uint64x1_t vcle_u64(uint64x1_t __a, uint64x1_t __b) { return (__a <= __b); } -FUNK uint32x4_t vcleq_f32(float32x4_t __a, float32x4_t __b) { +__funline uint32x4_t vcleq_f32(float32x4_t __a, float32x4_t __b) { return (uint32x4_t)(__a <= __b); } -FUNK uint64x2_t vcleq_f64(float64x2_t __a, float64x2_t __b) { +__funline uint64x2_t vcleq_f64(float64x2_t __a, float64x2_t __b) { return (uint64x2_t)(__a <= __b); } -FUNK uint8x16_t vcleq_s8(int8x16_t __a, int8x16_t __b) { +__funline uint8x16_t vcleq_s8(int8x16_t __a, int8x16_t __b) { return (uint8x16_t)(__a <= __b); } -FUNK uint16x8_t vcleq_s16(int16x8_t __a, int16x8_t __b) { +__funline uint16x8_t vcleq_s16(int16x8_t __a, int16x8_t __b) { return (uint16x8_t)(__a <= __b); } -FUNK uint32x4_t vcleq_s32(int32x4_t __a, int32x4_t __b) { +__funline uint32x4_t vcleq_s32(int32x4_t __a, int32x4_t __b) { return (uint32x4_t)(__a <= __b); } -FUNK uint64x2_t vcleq_s64(int64x2_t __a, int64x2_t __b) { +__funline uint64x2_t vcleq_s64(int64x2_t __a, int64x2_t __b) { return (uint64x2_t)(__a <= __b); } -FUNK uint8x16_t vcleq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vcleq_u8(uint8x16_t __a, uint8x16_t __b) { return (__a <= __b); } -FUNK uint16x8_t vcleq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vcleq_u16(uint16x8_t __a, uint16x8_t __b) { return (__a <= __b); } -FUNK uint32x4_t vcleq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vcleq_u32(uint32x4_t __a, uint32x4_t __b) { return (__a <= __b); } -FUNK uint64x2_t vcleq_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vcleq_u64(uint64x2_t __a, uint64x2_t __b) { return (__a <= __b); } -FUNK uint32_t vcles_f32(float32_t __a, float32_t __b) { +__funline uint32_t vcles_f32(float32_t __a, float32_t __b) { return __a <= __b ? -1 : 0; } -FUNK uint64_t vcled_s64(int64_t __a, int64_t __b) { +__funline uint64_t vcled_s64(int64_t __a, int64_t __b) { return __a <= __b ? -1ll : 0ll; } -FUNK uint64_t vcled_u64(uint64_t __a, uint64_t __b) { +__funline uint64_t vcled_u64(uint64_t __a, uint64_t __b) { return __a <= __b ? 
-1ll : 0ll; } -FUNK uint64_t vcled_f64(float64_t __a, float64_t __b) { +__funline uint64_t vcled_f64(float64_t __a, float64_t __b) { return __a <= __b ? -1ll : 0ll; } -FUNK uint32x2_t vclez_f32(float32x2_t __a) { +__funline uint32x2_t vclez_f32(float32x2_t __a) { return (uint32x2_t)(__a <= 0.0f); } -FUNK uint64x1_t vclez_f64(float64x1_t __a) { +__funline uint64x1_t vclez_f64(float64x1_t __a) { return (uint64x1_t)(__a <= (float64x1_t){0.0}); } -FUNK uint8x8_t vclez_s8(int8x8_t __a) { +__funline uint8x8_t vclez_s8(int8x8_t __a) { return (uint8x8_t)(__a <= 0); } -FUNK uint16x4_t vclez_s16(int16x4_t __a) { +__funline uint16x4_t vclez_s16(int16x4_t __a) { return (uint16x4_t)(__a <= 0); } -FUNK uint32x2_t vclez_s32(int32x2_t __a) { +__funline uint32x2_t vclez_s32(int32x2_t __a) { return (uint32x2_t)(__a <= 0); } -FUNK uint64x1_t vclez_s64(int64x1_t __a) { +__funline uint64x1_t vclez_s64(int64x1_t __a) { return (uint64x1_t)(__a <= __AARCH64_INT64_C(0)); } -FUNK uint32x4_t vclezq_f32(float32x4_t __a) { +__funline uint32x4_t vclezq_f32(float32x4_t __a) { return (uint32x4_t)(__a <= 0.0f); } -FUNK uint64x2_t vclezq_f64(float64x2_t __a) { +__funline uint64x2_t vclezq_f64(float64x2_t __a) { return (uint64x2_t)(__a <= 0.0); } -FUNK uint8x16_t vclezq_s8(int8x16_t __a) { +__funline uint8x16_t vclezq_s8(int8x16_t __a) { return (uint8x16_t)(__a <= 0); } -FUNK uint16x8_t vclezq_s16(int16x8_t __a) { +__funline uint16x8_t vclezq_s16(int16x8_t __a) { return (uint16x8_t)(__a <= 0); } -FUNK uint32x4_t vclezq_s32(int32x4_t __a) { +__funline uint32x4_t vclezq_s32(int32x4_t __a) { return (uint32x4_t)(__a <= 0); } -FUNK uint64x2_t vclezq_s64(int64x2_t __a) { +__funline uint64x2_t vclezq_s64(int64x2_t __a) { return (uint64x2_t)(__a <= __AARCH64_INT64_C(0)); } -FUNK uint32_t vclezs_f32(float32_t __a) { +__funline uint32_t vclezs_f32(float32_t __a) { return __a <= 0.0f ? -1 : 0; } -FUNK uint64_t vclezd_s64(int64_t __a) { +__funline uint64_t vclezd_s64(int64_t __a) { return __a <= 0 ? -1ll : 0ll; } -FUNK uint64_t vclezd_f64(float64_t __a) { +__funline uint64_t vclezd_f64(float64_t __a) { return __a <= 0.0 ? 
-1ll : 0ll; } -FUNK uint32x2_t vclt_f32(float32x2_t __a, float32x2_t __b) { +__funline uint32x2_t vclt_f32(float32x2_t __a, float32x2_t __b) { return (uint32x2_t)(__a < __b); } -FUNK uint64x1_t vclt_f64(float64x1_t __a, float64x1_t __b) { +__funline uint64x1_t vclt_f64(float64x1_t __a, float64x1_t __b) { return (uint64x1_t)(__a < __b); } -FUNK uint8x8_t vclt_s8(int8x8_t __a, int8x8_t __b) { +__funline uint8x8_t vclt_s8(int8x8_t __a, int8x8_t __b) { return (uint8x8_t)(__a < __b); } -FUNK uint16x4_t vclt_s16(int16x4_t __a, int16x4_t __b) { +__funline uint16x4_t vclt_s16(int16x4_t __a, int16x4_t __b) { return (uint16x4_t)(__a < __b); } -FUNK uint32x2_t vclt_s32(int32x2_t __a, int32x2_t __b) { +__funline uint32x2_t vclt_s32(int32x2_t __a, int32x2_t __b) { return (uint32x2_t)(__a < __b); } -FUNK uint64x1_t vclt_s64(int64x1_t __a, int64x1_t __b) { +__funline uint64x1_t vclt_s64(int64x1_t __a, int64x1_t __b) { return (uint64x1_t)(__a < __b); } -FUNK uint8x8_t vclt_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vclt_u8(uint8x8_t __a, uint8x8_t __b) { return (__a < __b); } -FUNK uint16x4_t vclt_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vclt_u16(uint16x4_t __a, uint16x4_t __b) { return (__a < __b); } -FUNK uint32x2_t vclt_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vclt_u32(uint32x2_t __a, uint32x2_t __b) { return (__a < __b); } -FUNK uint64x1_t vclt_u64(uint64x1_t __a, uint64x1_t __b) { +__funline uint64x1_t vclt_u64(uint64x1_t __a, uint64x1_t __b) { return (__a < __b); } -FUNK uint32x4_t vcltq_f32(float32x4_t __a, float32x4_t __b) { +__funline uint32x4_t vcltq_f32(float32x4_t __a, float32x4_t __b) { return (uint32x4_t)(__a < __b); } -FUNK uint64x2_t vcltq_f64(float64x2_t __a, float64x2_t __b) { +__funline uint64x2_t vcltq_f64(float64x2_t __a, float64x2_t __b) { return (uint64x2_t)(__a < __b); } -FUNK uint8x16_t vcltq_s8(int8x16_t __a, int8x16_t __b) { +__funline uint8x16_t vcltq_s8(int8x16_t __a, int8x16_t __b) { return (uint8x16_t)(__a < __b); } -FUNK uint16x8_t vcltq_s16(int16x8_t __a, int16x8_t __b) { +__funline uint16x8_t vcltq_s16(int16x8_t __a, int16x8_t __b) { return (uint16x8_t)(__a < __b); } -FUNK uint32x4_t vcltq_s32(int32x4_t __a, int32x4_t __b) { +__funline uint32x4_t vcltq_s32(int32x4_t __a, int32x4_t __b) { return (uint32x4_t)(__a < __b); } -FUNK uint64x2_t vcltq_s64(int64x2_t __a, int64x2_t __b) { +__funline uint64x2_t vcltq_s64(int64x2_t __a, int64x2_t __b) { return (uint64x2_t)(__a < __b); } -FUNK uint8x16_t vcltq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vcltq_u8(uint8x16_t __a, uint8x16_t __b) { return (__a < __b); } -FUNK uint16x8_t vcltq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vcltq_u16(uint16x8_t __a, uint16x8_t __b) { return (__a < __b); } -FUNK uint32x4_t vcltq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vcltq_u32(uint32x4_t __a, uint32x4_t __b) { return (__a < __b); } -FUNK uint64x2_t vcltq_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vcltq_u64(uint64x2_t __a, uint64x2_t __b) { return (__a < __b); } -FUNK uint32_t vclts_f32(float32_t __a, float32_t __b) { +__funline uint32_t vclts_f32(float32_t __a, float32_t __b) { return __a < __b ? -1 : 0; } -FUNK uint64_t vcltd_s64(int64_t __a, int64_t __b) { +__funline uint64_t vcltd_s64(int64_t __a, int64_t __b) { return __a < __b ? -1ll : 0ll; } -FUNK uint64_t vcltd_u64(uint64_t __a, uint64_t __b) { +__funline uint64_t vcltd_u64(uint64_t __a, uint64_t __b) { return __a < __b ? 
-1ll : 0ll; } -FUNK uint64_t vcltd_f64(float64_t __a, float64_t __b) { +__funline uint64_t vcltd_f64(float64_t __a, float64_t __b) { return __a < __b ? -1ll : 0ll; } -FUNK uint32x2_t vcltz_f32(float32x2_t __a) { +__funline uint32x2_t vcltz_f32(float32x2_t __a) { return (uint32x2_t)(__a < 0.0f); } -FUNK uint64x1_t vcltz_f64(float64x1_t __a) { +__funline uint64x1_t vcltz_f64(float64x1_t __a) { return (uint64x1_t)(__a < (float64x1_t){0.0}); } -FUNK uint8x8_t vcltz_s8(int8x8_t __a) { +__funline uint8x8_t vcltz_s8(int8x8_t __a) { return (uint8x8_t)(__a < 0); } -FUNK uint16x4_t vcltz_s16(int16x4_t __a) { +__funline uint16x4_t vcltz_s16(int16x4_t __a) { return (uint16x4_t)(__a < 0); } -FUNK uint32x2_t vcltz_s32(int32x2_t __a) { +__funline uint32x2_t vcltz_s32(int32x2_t __a) { return (uint32x2_t)(__a < 0); } -FUNK uint64x1_t vcltz_s64(int64x1_t __a) { +__funline uint64x1_t vcltz_s64(int64x1_t __a) { return (uint64x1_t)(__a < __AARCH64_INT64_C(0)); } -FUNK uint32x4_t vcltzq_f32(float32x4_t __a) { +__funline uint32x4_t vcltzq_f32(float32x4_t __a) { return (uint32x4_t)(__a < 0.0f); } -FUNK uint64x2_t vcltzq_f64(float64x2_t __a) { +__funline uint64x2_t vcltzq_f64(float64x2_t __a) { return (uint64x2_t)(__a < 0.0); } -FUNK uint8x16_t vcltzq_s8(int8x16_t __a) { +__funline uint8x16_t vcltzq_s8(int8x16_t __a) { return (uint8x16_t)(__a < 0); } -FUNK uint16x8_t vcltzq_s16(int16x8_t __a) { +__funline uint16x8_t vcltzq_s16(int16x8_t __a) { return (uint16x8_t)(__a < 0); } -FUNK uint32x4_t vcltzq_s32(int32x4_t __a) { +__funline uint32x4_t vcltzq_s32(int32x4_t __a) { return (uint32x4_t)(__a < 0); } -FUNK uint64x2_t vcltzq_s64(int64x2_t __a) { +__funline uint64x2_t vcltzq_s64(int64x2_t __a) { return (uint64x2_t)(__a < __AARCH64_INT64_C(0)); } -FUNK uint32_t vcltzs_f32(float32_t __a) { +__funline uint32_t vcltzs_f32(float32_t __a) { return __a < 0.0f ? -1 : 0; } -FUNK uint64_t vcltzd_s64(int64_t __a) { +__funline uint64_t vcltzd_s64(int64_t __a) { return __a < 0 ? -1ll : 0ll; } -FUNK uint64_t vcltzd_f64(float64_t __a) { +__funline uint64_t vcltzd_f64(float64_t __a) { return __a < 0.0 ? 
-1ll : 0ll; } -FUNK int8x8_t vcls_s8(int8x8_t __a) { +__funline int8x8_t vcls_s8(int8x8_t __a) { return __builtin_aarch64_clrsbv8qi(__a); } -FUNK int16x4_t vcls_s16(int16x4_t __a) { +__funline int16x4_t vcls_s16(int16x4_t __a) { return __builtin_aarch64_clrsbv4hi(__a); } -FUNK int32x2_t vcls_s32(int32x2_t __a) { +__funline int32x2_t vcls_s32(int32x2_t __a) { return __builtin_aarch64_clrsbv2si(__a); } -FUNK int8x16_t vclsq_s8(int8x16_t __a) { +__funline int8x16_t vclsq_s8(int8x16_t __a) { return __builtin_aarch64_clrsbv16qi(__a); } -FUNK int16x8_t vclsq_s16(int16x8_t __a) { +__funline int16x8_t vclsq_s16(int16x8_t __a) { return __builtin_aarch64_clrsbv8hi(__a); } -FUNK int32x4_t vclsq_s32(int32x4_t __a) { +__funline int32x4_t vclsq_s32(int32x4_t __a) { return __builtin_aarch64_clrsbv4si(__a); } -FUNK int8x8_t vclz_s8(int8x8_t __a) { +__funline int8x8_t vclz_s8(int8x8_t __a) { return __builtin_aarch64_clzv8qi(__a); } -FUNK int16x4_t vclz_s16(int16x4_t __a) { +__funline int16x4_t vclz_s16(int16x4_t __a) { return __builtin_aarch64_clzv4hi(__a); } -FUNK int32x2_t vclz_s32(int32x2_t __a) { +__funline int32x2_t vclz_s32(int32x2_t __a) { return __builtin_aarch64_clzv2si(__a); } -FUNK uint8x8_t vclz_u8(uint8x8_t __a) { +__funline uint8x8_t vclz_u8(uint8x8_t __a) { return (uint8x8_t)__builtin_aarch64_clzv8qi((int8x8_t)__a); } -FUNK uint16x4_t vclz_u16(uint16x4_t __a) { +__funline uint16x4_t vclz_u16(uint16x4_t __a) { return (uint16x4_t)__builtin_aarch64_clzv4hi((int16x4_t)__a); } -FUNK uint32x2_t vclz_u32(uint32x2_t __a) { +__funline uint32x2_t vclz_u32(uint32x2_t __a) { return (uint32x2_t)__builtin_aarch64_clzv2si((int32x2_t)__a); } -FUNK int8x16_t vclzq_s8(int8x16_t __a) { +__funline int8x16_t vclzq_s8(int8x16_t __a) { return __builtin_aarch64_clzv16qi(__a); } -FUNK int16x8_t vclzq_s16(int16x8_t __a) { +__funline int16x8_t vclzq_s16(int16x8_t __a) { return __builtin_aarch64_clzv8hi(__a); } -FUNK int32x4_t vclzq_s32(int32x4_t __a) { +__funline int32x4_t vclzq_s32(int32x4_t __a) { return __builtin_aarch64_clzv4si(__a); } -FUNK uint8x16_t vclzq_u8(uint8x16_t __a) { +__funline uint8x16_t vclzq_u8(uint8x16_t __a) { return (uint8x16_t)__builtin_aarch64_clzv16qi((int8x16_t)__a); } -FUNK uint16x8_t vclzq_u16(uint16x8_t __a) { +__funline uint16x8_t vclzq_u16(uint16x8_t __a) { return (uint16x8_t)__builtin_aarch64_clzv8hi((int16x8_t)__a); } -FUNK uint32x4_t vclzq_u32(uint32x4_t __a) { +__funline uint32x4_t vclzq_u32(uint32x4_t __a) { return (uint32x4_t)__builtin_aarch64_clzv4si((int32x4_t)__a); } -FUNK poly8x8_t vcnt_p8(poly8x8_t __a) { +__funline poly8x8_t vcnt_p8(poly8x8_t __a) { return (poly8x8_t)__builtin_aarch64_popcountv8qi((int8x8_t)__a); } -FUNK int8x8_t vcnt_s8(int8x8_t __a) { +__funline int8x8_t vcnt_s8(int8x8_t __a) { return __builtin_aarch64_popcountv8qi(__a); } -FUNK uint8x8_t vcnt_u8(uint8x8_t __a) { +__funline uint8x8_t vcnt_u8(uint8x8_t __a) { return (uint8x8_t)__builtin_aarch64_popcountv8qi((int8x8_t)__a); } -FUNK poly8x16_t vcntq_p8(poly8x16_t __a) { +__funline poly8x16_t vcntq_p8(poly8x16_t __a) { return (poly8x16_t)__builtin_aarch64_popcountv16qi((int8x16_t)__a); } -FUNK int8x16_t vcntq_s8(int8x16_t __a) { +__funline int8x16_t vcntq_s8(int8x16_t __a) { return __builtin_aarch64_popcountv16qi(__a); } -FUNK uint8x16_t vcntq_u8(uint8x16_t __a) { +__funline uint8x16_t vcntq_u8(uint8x16_t __a) { return (uint8x16_t)__builtin_aarch64_popcountv16qi((int8x16_t)__a); } -FUNK float32x2_t vcopy_lane_f32(float32x2_t __a, const int __lane1, - float32x2_t __b, const int __lane2) { +__funline float32x2_t 
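/*
 * vcls, vclz and vcnt above map onto the bit-counting instructions:
 * leading redundant sign bits (CLS), leading zeros (CLZ) and per-byte
 * population count (CNT). For an 8-bit lane holding 0x0F:
 *
 *   vcls_s8  -> 3   (sign bit is 0, followed by three more copies)
 *   vclz_s8  -> 4   (00001111 has four leading zeros)
 *   vcnt_s8  -> 4   (four bits are set)
 */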
vcopy_lane_f32(float32x2_t __a, const int __lane1, + float32x2_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK float64x1_t vcopy_lane_f64(float64x1_t __a, const int __lane1, - float64x1_t __b, const int __lane2) { +__funline float64x1_t vcopy_lane_f64(float64x1_t __a, const int __lane1, + float64x1_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK poly8x8_t vcopy_lane_p8(poly8x8_t __a, const int __lane1, poly8x8_t __b, - const int __lane2) { +__funline poly8x8_t vcopy_lane_p8(poly8x8_t __a, const int __lane1, + poly8x8_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK poly16x4_t vcopy_lane_p16(poly16x4_t __a, const int __lane1, - poly16x4_t __b, const int __lane2) { +__funline poly16x4_t vcopy_lane_p16(poly16x4_t __a, const int __lane1, + poly16x4_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK poly64x1_t vcopy_lane_p64(poly64x1_t __a, const int __lane1, - poly64x1_t __b, const int __lane2) { +__funline poly64x1_t vcopy_lane_p64(poly64x1_t __a, const int __lane1, + poly64x1_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int8x8_t vcopy_lane_s8(int8x8_t __a, const int __lane1, int8x8_t __b, - const int __lane2) { +__funline int8x8_t vcopy_lane_s8(int8x8_t __a, const int __lane1, int8x8_t __b, + const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int16x4_t vcopy_lane_s16(int16x4_t __a, const int __lane1, int16x4_t __b, - const int __lane2) { +__funline int16x4_t vcopy_lane_s16(int16x4_t __a, const int __lane1, + int16x4_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int32x2_t vcopy_lane_s32(int32x2_t __a, const int __lane1, int32x2_t __b, - const int __lane2) { +__funline int32x2_t vcopy_lane_s32(int32x2_t __a, const int __lane1, + int32x2_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int64x1_t vcopy_lane_s64(int64x1_t __a, const int __lane1, int64x1_t __b, - const int __lane2) { +__funline int64x1_t vcopy_lane_s64(int64x1_t __a, const int __lane1, + int64x1_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK uint8x8_t vcopy_lane_u8(uint8x8_t __a, const int __lane1, uint8x8_t __b, - const int __lane2) { +__funline uint8x8_t vcopy_lane_u8(uint8x8_t __a, const int __lane1, + uint8x8_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK uint16x4_t vcopy_lane_u16(uint16x4_t __a, const int __lane1, - uint16x4_t __b, const int __lane2) { +__funline uint16x4_t vcopy_lane_u16(uint16x4_t __a, const int __lane1, + uint16x4_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK uint32x2_t vcopy_lane_u32(uint32x2_t __a, const int __lane1, - uint32x2_t __b, const int __lane2) { +__funline uint32x2_t vcopy_lane_u32(uint32x2_t __a, const int __lane1, + uint32x2_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK uint64x1_t vcopy_lane_u64(uint64x1_t __a, const int __lane1, - 
uint64x1_t __b, const int __lane2) { +__funline uint64x1_t vcopy_lane_u64(uint64x1_t __a, const int __lane1, + uint64x1_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK float32x2_t vcopy_laneq_f32(float32x2_t __a, const int __lane1, - float32x4_t __b, const int __lane2) { +__funline float32x2_t vcopy_laneq_f32(float32x2_t __a, const int __lane1, + float32x4_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK float64x1_t vcopy_laneq_f64(float64x1_t __a, const int __lane1, - float64x2_t __b, const int __lane2) { +__funline float64x1_t vcopy_laneq_f64(float64x1_t __a, const int __lane1, + float64x2_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK poly8x8_t vcopy_laneq_p8(poly8x8_t __a, const int __lane1, poly8x16_t __b, - const int __lane2) { +__funline poly8x8_t vcopy_laneq_p8(poly8x8_t __a, const int __lane1, + poly8x16_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK poly16x4_t vcopy_laneq_p16(poly16x4_t __a, const int __lane1, - poly16x8_t __b, const int __lane2) { +__funline poly16x4_t vcopy_laneq_p16(poly16x4_t __a, const int __lane1, + poly16x8_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK poly64x1_t vcopy_laneq_p64(poly64x1_t __a, const int __lane1, - poly64x2_t __b, const int __lane2) { +__funline poly64x1_t vcopy_laneq_p64(poly64x1_t __a, const int __lane1, + poly64x2_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int8x8_t vcopy_laneq_s8(int8x8_t __a, const int __lane1, int8x16_t __b, - const int __lane2) { +__funline int8x8_t vcopy_laneq_s8(int8x8_t __a, const int __lane1, + int8x16_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int16x4_t vcopy_laneq_s16(int16x4_t __a, const int __lane1, int16x8_t __b, - const int __lane2) { +__funline int16x4_t vcopy_laneq_s16(int16x4_t __a, const int __lane1, + int16x8_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int32x2_t vcopy_laneq_s32(int32x2_t __a, const int __lane1, int32x4_t __b, - const int __lane2) { +__funline int32x2_t vcopy_laneq_s32(int32x2_t __a, const int __lane1, + int32x4_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int64x1_t vcopy_laneq_s64(int64x1_t __a, const int __lane1, int64x2_t __b, - const int __lane2) { +__funline int64x1_t vcopy_laneq_s64(int64x1_t __a, const int __lane1, + int64x2_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK uint8x8_t vcopy_laneq_u8(uint8x8_t __a, const int __lane1, uint8x16_t __b, - const int __lane2) { +__funline uint8x8_t vcopy_laneq_u8(uint8x8_t __a, const int __lane1, + uint8x16_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK uint16x4_t vcopy_laneq_u16(uint16x4_t __a, const int __lane1, - uint16x8_t __b, const int __lane2) { +__funline uint16x4_t vcopy_laneq_u16(uint16x4_t __a, const int __lane1, + uint16x8_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), 
__a, __lane1); } -FUNK uint32x2_t vcopy_laneq_u32(uint32x2_t __a, const int __lane1, - uint32x4_t __b, const int __lane2) { +__funline uint32x2_t vcopy_laneq_u32(uint32x2_t __a, const int __lane1, + uint32x4_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK uint64x1_t vcopy_laneq_u64(uint64x1_t __a, const int __lane1, - uint64x2_t __b, const int __lane2) { +__funline uint64x1_t vcopy_laneq_u64(uint64x1_t __a, const int __lane1, + uint64x2_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK float32x4_t vcopyq_lane_f32(float32x4_t __a, const int __lane1, - float32x2_t __b, const int __lane2) { +__funline float32x4_t vcopyq_lane_f32(float32x4_t __a, const int __lane1, + float32x2_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK float64x2_t vcopyq_lane_f64(float64x2_t __a, const int __lane1, - float64x1_t __b, const int __lane2) { +__funline float64x2_t vcopyq_lane_f64(float64x2_t __a, const int __lane1, + float64x1_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK poly8x16_t vcopyq_lane_p8(poly8x16_t __a, const int __lane1, poly8x8_t __b, - const int __lane2) { +__funline poly8x16_t vcopyq_lane_p8(poly8x16_t __a, const int __lane1, + poly8x8_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK poly16x8_t vcopyq_lane_p16(poly16x8_t __a, const int __lane1, - poly16x4_t __b, const int __lane2) { +__funline poly16x8_t vcopyq_lane_p16(poly16x8_t __a, const int __lane1, + poly16x4_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK poly64x2_t vcopyq_lane_p64(poly64x2_t __a, const int __lane1, - poly64x1_t __b, const int __lane2) { +__funline poly64x2_t vcopyq_lane_p64(poly64x2_t __a, const int __lane1, + poly64x1_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int8x16_t vcopyq_lane_s8(int8x16_t __a, const int __lane1, int8x8_t __b, - const int __lane2) { +__funline int8x16_t vcopyq_lane_s8(int8x16_t __a, const int __lane1, + int8x8_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int16x8_t vcopyq_lane_s16(int16x8_t __a, const int __lane1, int16x4_t __b, - const int __lane2) { +__funline int16x8_t vcopyq_lane_s16(int16x8_t __a, const int __lane1, + int16x4_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int32x4_t vcopyq_lane_s32(int32x4_t __a, const int __lane1, int32x2_t __b, - const int __lane2) { +__funline int32x4_t vcopyq_lane_s32(int32x4_t __a, const int __lane1, + int32x2_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int64x2_t vcopyq_lane_s64(int64x2_t __a, const int __lane1, int64x1_t __b, - const int __lane2) { +__funline int64x2_t vcopyq_lane_s64(int64x2_t __a, const int __lane1, + int64x1_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK uint8x16_t vcopyq_lane_u8(uint8x16_t __a, const int __lane1, uint8x8_t __b, - const int __lane2) { +__funline uint8x16_t vcopyq_lane_u8(uint8x16_t __a, const int __lane1, + uint8x8_t 
__b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK uint16x8_t vcopyq_lane_u16(uint16x8_t __a, const int __lane1, - uint16x4_t __b, const int __lane2) { +__funline uint16x8_t vcopyq_lane_u16(uint16x8_t __a, const int __lane1, + uint16x4_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK uint32x4_t vcopyq_lane_u32(uint32x4_t __a, const int __lane1, - uint32x2_t __b, const int __lane2) { +__funline uint32x4_t vcopyq_lane_u32(uint32x4_t __a, const int __lane1, + uint32x2_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK uint64x2_t vcopyq_lane_u64(uint64x2_t __a, const int __lane1, - uint64x1_t __b, const int __lane2) { +__funline uint64x2_t vcopyq_lane_u64(uint64x2_t __a, const int __lane1, + uint64x1_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK float32x4_t vcopyq_laneq_f32(float32x4_t __a, const int __lane1, - float32x4_t __b, const int __lane2) { +__funline float32x4_t vcopyq_laneq_f32(float32x4_t __a, const int __lane1, + float32x4_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK float64x2_t vcopyq_laneq_f64(float64x2_t __a, const int __lane1, - float64x2_t __b, const int __lane2) { +__funline float64x2_t vcopyq_laneq_f64(float64x2_t __a, const int __lane1, + float64x2_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK poly8x16_t vcopyq_laneq_p8(poly8x16_t __a, const int __lane1, - poly8x16_t __b, const int __lane2) { +__funline poly8x16_t vcopyq_laneq_p8(poly8x16_t __a, const int __lane1, + poly8x16_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK poly16x8_t vcopyq_laneq_p16(poly16x8_t __a, const int __lane1, - poly16x8_t __b, const int __lane2) { +__funline poly16x8_t vcopyq_laneq_p16(poly16x8_t __a, const int __lane1, + poly16x8_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK poly64x2_t vcopyq_laneq_p64(poly64x2_t __a, const int __lane1, - poly64x2_t __b, const int __lane2) { +__funline poly64x2_t vcopyq_laneq_p64(poly64x2_t __a, const int __lane1, + poly64x2_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int8x16_t vcopyq_laneq_s8(int8x16_t __a, const int __lane1, int8x16_t __b, - const int __lane2) { +__funline int8x16_t vcopyq_laneq_s8(int8x16_t __a, const int __lane1, + int8x16_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int16x8_t vcopyq_laneq_s16(int16x8_t __a, const int __lane1, int16x8_t __b, - const int __lane2) { +__funline int16x8_t vcopyq_laneq_s16(int16x8_t __a, const int __lane1, + int16x8_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int32x4_t vcopyq_laneq_s32(int32x4_t __a, const int __lane1, int32x4_t __b, - const int __lane2) { +__funline int32x4_t vcopyq_laneq_s32(int32x4_t __a, const int __lane1, + int32x4_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK int64x2_t vcopyq_laneq_s64(int64x2_t __a, const int __lane1, 
int64x2_t __b, - const int __lane2) { +__funline int64x2_t vcopyq_laneq_s64(int64x2_t __a, const int __lane1, + int64x2_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK uint8x16_t vcopyq_laneq_u8(uint8x16_t __a, const int __lane1, - uint8x16_t __b, const int __lane2) { +__funline uint8x16_t vcopyq_laneq_u8(uint8x16_t __a, const int __lane1, + uint8x16_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK uint16x8_t vcopyq_laneq_u16(uint16x8_t __a, const int __lane1, - uint16x8_t __b, const int __lane2) { +__funline uint16x8_t vcopyq_laneq_u16(uint16x8_t __a, const int __lane1, + uint16x8_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK uint32x4_t vcopyq_laneq_u32(uint32x4_t __a, const int __lane1, - uint32x4_t __b, const int __lane2) { +__funline uint32x4_t vcopyq_laneq_u32(uint32x4_t __a, const int __lane1, + uint32x4_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK uint64x2_t vcopyq_laneq_u64(uint64x2_t __a, const int __lane1, - uint64x2_t __b, const int __lane2) { +__funline uint64x2_t vcopyq_laneq_u64(uint64x2_t __a, const int __lane1, + uint64x2_t __b, const int __lane2) { return __aarch64_vset_lane_any(__aarch64_vget_lane_any(__b, __lane2), __a, __lane1); } -FUNK float16x4_t vcvt_f16_f32(float32x4_t __a) { +__funline float16x4_t vcvt_f16_f32(float32x4_t __a) { return __builtin_aarch64_float_truncate_lo_v4hf(__a); } -FUNK float16x8_t vcvt_high_f16_f32(float16x4_t __a, float32x4_t __b) { +__funline float16x8_t vcvt_high_f16_f32(float16x4_t __a, float32x4_t __b) { return __builtin_aarch64_float_truncate_hi_v8hf(__a, __b); } -FUNK float32x2_t vcvt_f32_f64(float64x2_t __a) { +__funline float32x2_t vcvt_f32_f64(float64x2_t __a) { return __builtin_aarch64_float_truncate_lo_v2sf(__a); } -FUNK float32x4_t vcvt_high_f32_f64(float32x2_t __a, float64x2_t __b) { +__funline float32x4_t vcvt_high_f32_f64(float32x2_t __a, float64x2_t __b) { return __builtin_aarch64_float_truncate_hi_v4sf(__a, __b); } -FUNK float32x4_t vcvt_f32_f16(float16x4_t __a) { +__funline float32x4_t vcvt_f32_f16(float16x4_t __a) { return __builtin_aarch64_float_extend_lo_v4sf(__a); } -FUNK float64x2_t vcvt_f64_f32(float32x2_t __a) { +__funline float64x2_t vcvt_f64_f32(float32x2_t __a) { return __builtin_aarch64_float_extend_lo_v2df(__a); } -FUNK float32x4_t vcvt_high_f32_f16(float16x8_t __a) { +__funline float32x4_t vcvt_high_f32_f16(float16x8_t __a) { return __builtin_aarch64_vec_unpacks_hi_v8hf(__a); } -FUNK float64x2_t vcvt_high_f64_f32(float32x4_t __a) { +__funline float64x2_t vcvt_high_f64_f32(float32x4_t __a) { return __builtin_aarch64_vec_unpacks_hi_v4sf(__a); } -FUNK float64_t vcvtd_n_f64_s64(int64_t __a, const int __b) { +__funline float64_t vcvtd_n_f64_s64(int64_t __a, const int __b) { return __builtin_aarch64_scvtfdi(__a, __b); } -FUNK float64_t vcvtd_n_f64_u64(uint64_t __a, const int __b) { +__funline float64_t vcvtd_n_f64_u64(uint64_t __a, const int __b) { return __builtin_aarch64_ucvtfdi_sus(__a, __b); } -FUNK float32_t vcvts_n_f32_s32(int32_t __a, const int __b) { +__funline float32_t vcvts_n_f32_s32(int32_t __a, const int __b) { return __builtin_aarch64_scvtfsi(__a, __b); } -FUNK float32_t vcvts_n_f32_u32(uint32_t __a, const int __b) { +__funline float32_t vcvts_n_f32_u32(uint32_t __a, const int __b) { return 
__builtin_aarch64_ucvtfsi_sus(__a, __b); } -FUNK float32x2_t vcvt_n_f32_s32(int32x2_t __a, const int __b) { +__funline float32x2_t vcvt_n_f32_s32(int32x2_t __a, const int __b) { return __builtin_aarch64_scvtfv2si(__a, __b); } -FUNK float32x2_t vcvt_n_f32_u32(uint32x2_t __a, const int __b) { +__funline float32x2_t vcvt_n_f32_u32(uint32x2_t __a, const int __b) { return __builtin_aarch64_ucvtfv2si_sus(__a, __b); } -FUNK float64x1_t vcvt_n_f64_s64(int64x1_t __a, const int __b) { +__funline float64x1_t vcvt_n_f64_s64(int64x1_t __a, const int __b) { return (float64x1_t){__builtin_aarch64_scvtfdi(vget_lane_s64(__a, 0), __b)}; } -FUNK float64x1_t vcvt_n_f64_u64(uint64x1_t __a, const int __b) { +__funline float64x1_t vcvt_n_f64_u64(uint64x1_t __a, const int __b) { return (float64x1_t){ __builtin_aarch64_ucvtfdi_sus(vget_lane_u64(__a, 0), __b)}; } -FUNK float32x4_t vcvtq_n_f32_s32(int32x4_t __a, const int __b) { +__funline float32x4_t vcvtq_n_f32_s32(int32x4_t __a, const int __b) { return __builtin_aarch64_scvtfv4si(__a, __b); } -FUNK float32x4_t vcvtq_n_f32_u32(uint32x4_t __a, const int __b) { +__funline float32x4_t vcvtq_n_f32_u32(uint32x4_t __a, const int __b) { return __builtin_aarch64_ucvtfv4si_sus(__a, __b); } -FUNK float64x2_t vcvtq_n_f64_s64(int64x2_t __a, const int __b) { +__funline float64x2_t vcvtq_n_f64_s64(int64x2_t __a, const int __b) { return __builtin_aarch64_scvtfv2di(__a, __b); } -FUNK float64x2_t vcvtq_n_f64_u64(uint64x2_t __a, const int __b) { +__funline float64x2_t vcvtq_n_f64_u64(uint64x2_t __a, const int __b) { return __builtin_aarch64_ucvtfv2di_sus(__a, __b); } -FUNK int64_t vcvtd_n_s64_f64(float64_t __a, const int __b) { +__funline int64_t vcvtd_n_s64_f64(float64_t __a, const int __b) { return __builtin_aarch64_fcvtzsdf(__a, __b); } -FUNK uint64_t vcvtd_n_u64_f64(float64_t __a, const int __b) { +__funline uint64_t vcvtd_n_u64_f64(float64_t __a, const int __b) { return __builtin_aarch64_fcvtzudf_uss(__a, __b); } -FUNK int32_t vcvts_n_s32_f32(float32_t __a, const int __b) { +__funline int32_t vcvts_n_s32_f32(float32_t __a, const int __b) { return __builtin_aarch64_fcvtzssf(__a, __b); } -FUNK uint32_t vcvts_n_u32_f32(float32_t __a, const int __b) { +__funline uint32_t vcvts_n_u32_f32(float32_t __a, const int __b) { return __builtin_aarch64_fcvtzusf_uss(__a, __b); } -FUNK int32x2_t vcvt_n_s32_f32(float32x2_t __a, const int __b) { +__funline int32x2_t vcvt_n_s32_f32(float32x2_t __a, const int __b) { return __builtin_aarch64_fcvtzsv2sf(__a, __b); } -FUNK uint32x2_t vcvt_n_u32_f32(float32x2_t __a, const int __b) { +__funline uint32x2_t vcvt_n_u32_f32(float32x2_t __a, const int __b) { return __builtin_aarch64_fcvtzuv2sf_uss(__a, __b); } -FUNK int64x1_t vcvt_n_s64_f64(float64x1_t __a, const int __b) { +__funline int64x1_t vcvt_n_s64_f64(float64x1_t __a, const int __b) { return (int64x1_t){__builtin_aarch64_fcvtzsdf(vget_lane_f64(__a, 0), __b)}; } -FUNK uint64x1_t vcvt_n_u64_f64(float64x1_t __a, const int __b) { +__funline uint64x1_t vcvt_n_u64_f64(float64x1_t __a, const int __b) { return (uint64x1_t){ __builtin_aarch64_fcvtzudf_uss(vget_lane_f64(__a, 0), __b)}; } -FUNK int32x4_t vcvtq_n_s32_f32(float32x4_t __a, const int __b) { +__funline int32x4_t vcvtq_n_s32_f32(float32x4_t __a, const int __b) { return __builtin_aarch64_fcvtzsv4sf(__a, __b); } -FUNK uint32x4_t vcvtq_n_u32_f32(float32x4_t __a, const int __b) { +__funline uint32x4_t vcvtq_n_u32_f32(float32x4_t __a, const int __b) { return __builtin_aarch64_fcvtzuv4sf_uss(__a, __b); } -FUNK int64x2_t vcvtq_n_s64_f64(float64x2_t __a, 
const int __b) { +__funline int64x2_t vcvtq_n_s64_f64(float64x2_t __a, const int __b) { return __builtin_aarch64_fcvtzsv2df(__a, __b); } -FUNK uint64x2_t vcvtq_n_u64_f64(float64x2_t __a, const int __b) { +__funline uint64x2_t vcvtq_n_u64_f64(float64x2_t __a, const int __b) { return __builtin_aarch64_fcvtzuv2df_uss(__a, __b); } -FUNK float64_t vcvtd_f64_s64(int64_t __a) { +__funline float64_t vcvtd_f64_s64(int64_t __a) { return (float64_t)__a; } -FUNK float64_t vcvtd_f64_u64(uint64_t __a) { +__funline float64_t vcvtd_f64_u64(uint64_t __a) { return (float64_t)__a; } -FUNK float32_t vcvts_f32_s32(int32_t __a) { +__funline float32_t vcvts_f32_s32(int32_t __a) { return (float32_t)__a; } -FUNK float32_t vcvts_f32_u32(uint32_t __a) { +__funline float32_t vcvts_f32_u32(uint32_t __a) { return (float32_t)__a; } -FUNK float32x2_t vcvt_f32_s32(int32x2_t __a) { +__funline float32x2_t vcvt_f32_s32(int32x2_t __a) { return __builtin_aarch64_floatv2siv2sf(__a); } -FUNK float32x2_t vcvt_f32_u32(uint32x2_t __a) { +__funline float32x2_t vcvt_f32_u32(uint32x2_t __a) { return __builtin_aarch64_floatunsv2siv2sf((int32x2_t)__a); } -FUNK float64x1_t vcvt_f64_s64(int64x1_t __a) { +__funline float64x1_t vcvt_f64_s64(int64x1_t __a) { return (float64x1_t){vget_lane_s64(__a, 0)}; } -FUNK float64x1_t vcvt_f64_u64(uint64x1_t __a) { +__funline float64x1_t vcvt_f64_u64(uint64x1_t __a) { return (float64x1_t){vget_lane_u64(__a, 0)}; } -FUNK float32x4_t vcvtq_f32_s32(int32x4_t __a) { +__funline float32x4_t vcvtq_f32_s32(int32x4_t __a) { return __builtin_aarch64_floatv4siv4sf(__a); } -FUNK float32x4_t vcvtq_f32_u32(uint32x4_t __a) { +__funline float32x4_t vcvtq_f32_u32(uint32x4_t __a) { return __builtin_aarch64_floatunsv4siv4sf((int32x4_t)__a); } -FUNK float64x2_t vcvtq_f64_s64(int64x2_t __a) { +__funline float64x2_t vcvtq_f64_s64(int64x2_t __a) { return __builtin_aarch64_floatv2div2df(__a); } -FUNK float64x2_t vcvtq_f64_u64(uint64x2_t __a) { +__funline float64x2_t vcvtq_f64_u64(uint64x2_t __a) { return __builtin_aarch64_floatunsv2div2df((int64x2_t)__a); } -FUNK int64_t vcvtd_s64_f64(float64_t __a) { +__funline int64_t vcvtd_s64_f64(float64_t __a) { return (int64_t)__a; } -FUNK uint64_t vcvtd_u64_f64(float64_t __a) { +__funline uint64_t vcvtd_u64_f64(float64_t __a) { return (uint64_t)__a; } -FUNK int32_t vcvts_s32_f32(float32_t __a) { +__funline int32_t vcvts_s32_f32(float32_t __a) { return (int32_t)__a; } -FUNK uint32_t vcvts_u32_f32(float32_t __a) { +__funline uint32_t vcvts_u32_f32(float32_t __a) { return (uint32_t)__a; } -FUNK int32x2_t vcvt_s32_f32(float32x2_t __a) { +__funline int32x2_t vcvt_s32_f32(float32x2_t __a) { return __builtin_aarch64_lbtruncv2sfv2si(__a); } -FUNK uint32x2_t vcvt_u32_f32(float32x2_t __a) { +__funline uint32x2_t vcvt_u32_f32(float32x2_t __a) { return __builtin_aarch64_lbtruncuv2sfv2si_us(__a); } -FUNK int32x4_t vcvtq_s32_f32(float32x4_t __a) { +__funline int32x4_t vcvtq_s32_f32(float32x4_t __a) { return __builtin_aarch64_lbtruncv4sfv4si(__a); } -FUNK uint32x4_t vcvtq_u32_f32(float32x4_t __a) { +__funline uint32x4_t vcvtq_u32_f32(float32x4_t __a) { return __builtin_aarch64_lbtruncuv4sfv4si_us(__a); } -FUNK int64x1_t vcvt_s64_f64(float64x1_t __a) { +__funline int64x1_t vcvt_s64_f64(float64x1_t __a) { return (int64x1_t){vcvtd_s64_f64(__a[0])}; } -FUNK uint64x1_t vcvt_u64_f64(float64x1_t __a) { +__funline uint64x1_t vcvt_u64_f64(float64x1_t __a) { return (uint64x1_t){vcvtd_u64_f64(__a[0])}; } -FUNK int64x2_t vcvtq_s64_f64(float64x2_t __a) { +__funline int64x2_t vcvtq_s64_f64(float64x2_t __a) { return 
__builtin_aarch64_lbtruncv2dfv2di(__a); } -FUNK uint64x2_t vcvtq_u64_f64(float64x2_t __a) { +__funline uint64x2_t vcvtq_u64_f64(float64x2_t __a) { return __builtin_aarch64_lbtruncuv2dfv2di_us(__a); } -FUNK int64_t vcvtad_s64_f64(float64_t __a) { +__funline int64_t vcvtad_s64_f64(float64_t __a) { return __builtin_aarch64_lrounddfdi(__a); } -FUNK uint64_t vcvtad_u64_f64(float64_t __a) { +__funline uint64_t vcvtad_u64_f64(float64_t __a) { return __builtin_aarch64_lroundudfdi_us(__a); } -FUNK int32_t vcvtas_s32_f32(float32_t __a) { +__funline int32_t vcvtas_s32_f32(float32_t __a) { return __builtin_aarch64_lroundsfsi(__a); } -FUNK uint32_t vcvtas_u32_f32(float32_t __a) { +__funline uint32_t vcvtas_u32_f32(float32_t __a) { return __builtin_aarch64_lroundusfsi_us(__a); } -FUNK int32x2_t vcvta_s32_f32(float32x2_t __a) { +__funline int32x2_t vcvta_s32_f32(float32x2_t __a) { return __builtin_aarch64_lroundv2sfv2si(__a); } -FUNK uint32x2_t vcvta_u32_f32(float32x2_t __a) { +__funline uint32x2_t vcvta_u32_f32(float32x2_t __a) { return __builtin_aarch64_lrounduv2sfv2si_us(__a); } -FUNK int32x4_t vcvtaq_s32_f32(float32x4_t __a) { +__funline int32x4_t vcvtaq_s32_f32(float32x4_t __a) { return __builtin_aarch64_lroundv4sfv4si(__a); } -FUNK uint32x4_t vcvtaq_u32_f32(float32x4_t __a) { +__funline uint32x4_t vcvtaq_u32_f32(float32x4_t __a) { return __builtin_aarch64_lrounduv4sfv4si_us(__a); } -FUNK int64x1_t vcvta_s64_f64(float64x1_t __a) { +__funline int64x1_t vcvta_s64_f64(float64x1_t __a) { return (int64x1_t){vcvtad_s64_f64(__a[0])}; } -FUNK uint64x1_t vcvta_u64_f64(float64x1_t __a) { +__funline uint64x1_t vcvta_u64_f64(float64x1_t __a) { return (uint64x1_t){vcvtad_u64_f64(__a[0])}; } -FUNK int64x2_t vcvtaq_s64_f64(float64x2_t __a) { +__funline int64x2_t vcvtaq_s64_f64(float64x2_t __a) { return __builtin_aarch64_lroundv2dfv2di(__a); } -FUNK uint64x2_t vcvtaq_u64_f64(float64x2_t __a) { +__funline uint64x2_t vcvtaq_u64_f64(float64x2_t __a) { return __builtin_aarch64_lrounduv2dfv2di_us(__a); } -FUNK int64_t vcvtmd_s64_f64(float64_t __a) { +__funline int64_t vcvtmd_s64_f64(float64_t __a) { return __builtin_llfloor(__a); } -FUNK uint64_t vcvtmd_u64_f64(float64_t __a) { +__funline uint64_t vcvtmd_u64_f64(float64_t __a) { return __builtin_aarch64_lfloorudfdi_us(__a); } -FUNK int32_t vcvtms_s32_f32(float32_t __a) { +__funline int32_t vcvtms_s32_f32(float32_t __a) { return __builtin_ifloorf(__a); } -FUNK uint32_t vcvtms_u32_f32(float32_t __a) { +__funline uint32_t vcvtms_u32_f32(float32_t __a) { return __builtin_aarch64_lfloorusfsi_us(__a); } -FUNK int32x2_t vcvtm_s32_f32(float32x2_t __a) { +__funline int32x2_t vcvtm_s32_f32(float32x2_t __a) { return __builtin_aarch64_lfloorv2sfv2si(__a); } -FUNK uint32x2_t vcvtm_u32_f32(float32x2_t __a) { +__funline uint32x2_t vcvtm_u32_f32(float32x2_t __a) { return __builtin_aarch64_lflooruv2sfv2si_us(__a); } -FUNK int32x4_t vcvtmq_s32_f32(float32x4_t __a) { +__funline int32x4_t vcvtmq_s32_f32(float32x4_t __a) { return __builtin_aarch64_lfloorv4sfv4si(__a); } -FUNK uint32x4_t vcvtmq_u32_f32(float32x4_t __a) { +__funline uint32x4_t vcvtmq_u32_f32(float32x4_t __a) { return __builtin_aarch64_lflooruv4sfv4si_us(__a); } -FUNK int64x1_t vcvtm_s64_f64(float64x1_t __a) { +__funline int64x1_t vcvtm_s64_f64(float64x1_t __a) { return (int64x1_t){vcvtmd_s64_f64(__a[0])}; } -FUNK uint64x1_t vcvtm_u64_f64(float64x1_t __a) { +__funline uint64x1_t vcvtm_u64_f64(float64x1_t __a) { return (uint64x1_t){vcvtmd_u64_f64(__a[0])}; } -FUNK int64x2_t vcvtmq_s64_f64(float64x2_t __a) { +__funline int64x2_t 
vcvtmq_s64_f64(float64x2_t __a) { return __builtin_aarch64_lfloorv2dfv2di(__a); } -FUNK uint64x2_t vcvtmq_u64_f64(float64x2_t __a) { +__funline uint64x2_t vcvtmq_u64_f64(float64x2_t __a) { return __builtin_aarch64_lflooruv2dfv2di_us(__a); } -FUNK int64_t vcvtnd_s64_f64(float64_t __a) { +__funline int64_t vcvtnd_s64_f64(float64_t __a) { return __builtin_aarch64_lfrintndfdi(__a); } -FUNK uint64_t vcvtnd_u64_f64(float64_t __a) { +__funline uint64_t vcvtnd_u64_f64(float64_t __a) { return __builtin_aarch64_lfrintnudfdi_us(__a); } -FUNK int32_t vcvtns_s32_f32(float32_t __a) { +__funline int32_t vcvtns_s32_f32(float32_t __a) { return __builtin_aarch64_lfrintnsfsi(__a); } -FUNK uint32_t vcvtns_u32_f32(float32_t __a) { +__funline uint32_t vcvtns_u32_f32(float32_t __a) { return __builtin_aarch64_lfrintnusfsi_us(__a); } -FUNK int32x2_t vcvtn_s32_f32(float32x2_t __a) { +__funline int32x2_t vcvtn_s32_f32(float32x2_t __a) { return __builtin_aarch64_lfrintnv2sfv2si(__a); } -FUNK uint32x2_t vcvtn_u32_f32(float32x2_t __a) { +__funline uint32x2_t vcvtn_u32_f32(float32x2_t __a) { return __builtin_aarch64_lfrintnuv2sfv2si_us(__a); } -FUNK int32x4_t vcvtnq_s32_f32(float32x4_t __a) { +__funline int32x4_t vcvtnq_s32_f32(float32x4_t __a) { return __builtin_aarch64_lfrintnv4sfv4si(__a); } -FUNK uint32x4_t vcvtnq_u32_f32(float32x4_t __a) { +__funline uint32x4_t vcvtnq_u32_f32(float32x4_t __a) { return __builtin_aarch64_lfrintnuv4sfv4si_us(__a); } -FUNK int64x1_t vcvtn_s64_f64(float64x1_t __a) { +__funline int64x1_t vcvtn_s64_f64(float64x1_t __a) { return (int64x1_t){vcvtnd_s64_f64(__a[0])}; } -FUNK uint64x1_t vcvtn_u64_f64(float64x1_t __a) { +__funline uint64x1_t vcvtn_u64_f64(float64x1_t __a) { return (uint64x1_t){vcvtnd_u64_f64(__a[0])}; } -FUNK int64x2_t vcvtnq_s64_f64(float64x2_t __a) { +__funline int64x2_t vcvtnq_s64_f64(float64x2_t __a) { return __builtin_aarch64_lfrintnv2dfv2di(__a); } -FUNK uint64x2_t vcvtnq_u64_f64(float64x2_t __a) { +__funline uint64x2_t vcvtnq_u64_f64(float64x2_t __a) { return __builtin_aarch64_lfrintnuv2dfv2di_us(__a); } -FUNK int64_t vcvtpd_s64_f64(float64_t __a) { +__funline int64_t vcvtpd_s64_f64(float64_t __a) { return __builtin_llceil(__a); } -FUNK uint64_t vcvtpd_u64_f64(float64_t __a) { +__funline uint64_t vcvtpd_u64_f64(float64_t __a) { return __builtin_aarch64_lceiludfdi_us(__a); } -FUNK int32_t vcvtps_s32_f32(float32_t __a) { +__funline int32_t vcvtps_s32_f32(float32_t __a) { return __builtin_iceilf(__a); } -FUNK uint32_t vcvtps_u32_f32(float32_t __a) { +__funline uint32_t vcvtps_u32_f32(float32_t __a) { return __builtin_aarch64_lceilusfsi_us(__a); } -FUNK int32x2_t vcvtp_s32_f32(float32x2_t __a) { +__funline int32x2_t vcvtp_s32_f32(float32x2_t __a) { return __builtin_aarch64_lceilv2sfv2si(__a); } -FUNK uint32x2_t vcvtp_u32_f32(float32x2_t __a) { +__funline uint32x2_t vcvtp_u32_f32(float32x2_t __a) { return __builtin_aarch64_lceiluv2sfv2si_us(__a); } -FUNK int32x4_t vcvtpq_s32_f32(float32x4_t __a) { +__funline int32x4_t vcvtpq_s32_f32(float32x4_t __a) { return __builtin_aarch64_lceilv4sfv4si(__a); } -FUNK uint32x4_t vcvtpq_u32_f32(float32x4_t __a) { +__funline uint32x4_t vcvtpq_u32_f32(float32x4_t __a) { return __builtin_aarch64_lceiluv4sfv4si_us(__a); } -FUNK int64x1_t vcvtp_s64_f64(float64x1_t __a) { +__funline int64x1_t vcvtp_s64_f64(float64x1_t __a) { return (int64x1_t){vcvtpd_s64_f64(__a[0])}; } -FUNK uint64x1_t vcvtp_u64_f64(float64x1_t __a) { +__funline uint64x1_t vcvtp_u64_f64(float64x1_t __a) { return (uint64x1_t){vcvtpd_u64_f64(__a[0])}; } -FUNK int64x2_t 
vcvtpq_s64_f64(float64x2_t __a) { +__funline int64x2_t vcvtpq_s64_f64(float64x2_t __a) { return __builtin_aarch64_lceilv2dfv2di(__a); } -FUNK uint64x2_t vcvtpq_u64_f64(float64x2_t __a) { +__funline uint64x2_t vcvtpq_u64_f64(float64x2_t __a) { return __builtin_aarch64_lceiluv2dfv2di_us(__a); } -FUNK float16x4_t vdup_n_f16(float16_t __a) { +__funline float16x4_t vdup_n_f16(float16_t __a) { return (float16x4_t){__a, __a, __a, __a}; } -FUNK float32x2_t vdup_n_f32(float32_t __a) { +__funline float32x2_t vdup_n_f32(float32_t __a) { return (float32x2_t){__a, __a}; } -FUNK float64x1_t vdup_n_f64(float64_t __a) { +__funline float64x1_t vdup_n_f64(float64_t __a) { return (float64x1_t){__a}; } -FUNK poly8x8_t vdup_n_p8(poly8_t __a) { +__funline poly8x8_t vdup_n_p8(poly8_t __a) { return (poly8x8_t){__a, __a, __a, __a, __a, __a, __a, __a}; } -FUNK poly16x4_t vdup_n_p16(poly16_t __a) { +__funline poly16x4_t vdup_n_p16(poly16_t __a) { return (poly16x4_t){__a, __a, __a, __a}; } -FUNK poly64x1_t vdup_n_p64(poly64_t __a) { +__funline poly64x1_t vdup_n_p64(poly64_t __a) { return (poly64x1_t){__a}; } -FUNK int8x8_t vdup_n_s8(int8_t __a) { +__funline int8x8_t vdup_n_s8(int8_t __a) { return (int8x8_t){__a, __a, __a, __a, __a, __a, __a, __a}; } -FUNK int16x4_t vdup_n_s16(int16_t __a) { +__funline int16x4_t vdup_n_s16(int16_t __a) { return (int16x4_t){__a, __a, __a, __a}; } -FUNK int32x2_t vdup_n_s32(int32_t __a) { +__funline int32x2_t vdup_n_s32(int32_t __a) { return (int32x2_t){__a, __a}; } -FUNK int64x1_t vdup_n_s64(int64_t __a) { +__funline int64x1_t vdup_n_s64(int64_t __a) { return (int64x1_t){__a}; } -FUNK uint8x8_t vdup_n_u8(uint8_t __a) { +__funline uint8x8_t vdup_n_u8(uint8_t __a) { return (uint8x8_t){__a, __a, __a, __a, __a, __a, __a, __a}; } -FUNK uint16x4_t vdup_n_u16(uint16_t __a) { +__funline uint16x4_t vdup_n_u16(uint16_t __a) { return (uint16x4_t){__a, __a, __a, __a}; } -FUNK uint32x2_t vdup_n_u32(uint32_t __a) { +__funline uint32x2_t vdup_n_u32(uint32_t __a) { return (uint32x2_t){__a, __a}; } -FUNK uint64x1_t vdup_n_u64(uint64_t __a) { +__funline uint64x1_t vdup_n_u64(uint64_t __a) { return (uint64x1_t){__a}; } -FUNK float16x8_t vdupq_n_f16(float16_t __a) { +__funline float16x8_t vdupq_n_f16(float16_t __a) { return (float16x8_t){__a, __a, __a, __a, __a, __a, __a, __a}; } -FUNK float32x4_t vdupq_n_f32(float32_t __a) { +__funline float32x4_t vdupq_n_f32(float32_t __a) { return (float32x4_t){__a, __a, __a, __a}; } -FUNK float64x2_t vdupq_n_f64(float64_t __a) { +__funline float64x2_t vdupq_n_f64(float64_t __a) { return (float64x2_t){__a, __a}; } -FUNK poly8x16_t vdupq_n_p8(uint32_t __a) { +__funline poly8x16_t vdupq_n_p8(uint32_t __a) { return (poly8x16_t){__a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a}; } -FUNK poly16x8_t vdupq_n_p16(uint32_t __a) { +__funline poly16x8_t vdupq_n_p16(uint32_t __a) { return (poly16x8_t){__a, __a, __a, __a, __a, __a, __a, __a}; } -FUNK poly64x2_t vdupq_n_p64(uint64_t __a) { +__funline poly64x2_t vdupq_n_p64(uint64_t __a) { return (poly64x2_t){__a, __a}; } -FUNK int8x16_t vdupq_n_s8(int32_t __a) { +__funline int8x16_t vdupq_n_s8(int32_t __a) { return (int8x16_t){__a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a}; } -FUNK int16x8_t vdupq_n_s16(int32_t __a) { +__funline int16x8_t vdupq_n_s16(int32_t __a) { return (int16x8_t){__a, __a, __a, __a, __a, __a, __a, __a}; } -FUNK int32x4_t vdupq_n_s32(int32_t __a) { +__funline int32x4_t vdupq_n_s32(int32_t __a) { return (int32x4_t){__a, __a, __a, __a}; } -FUNK 
int64x2_t vdupq_n_s64(int64_t __a) { +__funline int64x2_t vdupq_n_s64(int64_t __a) { return (int64x2_t){__a, __a}; } -FUNK uint8x16_t vdupq_n_u8(uint32_t __a) { +__funline uint8x16_t vdupq_n_u8(uint32_t __a) { return (uint8x16_t){__a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a}; } -FUNK uint16x8_t vdupq_n_u16(uint32_t __a) { +__funline uint16x8_t vdupq_n_u16(uint32_t __a) { return (uint16x8_t){__a, __a, __a, __a, __a, __a, __a, __a}; } -FUNK uint32x4_t vdupq_n_u32(uint32_t __a) { +__funline uint32x4_t vdupq_n_u32(uint32_t __a) { return (uint32x4_t){__a, __a, __a, __a}; } -FUNK uint64x2_t vdupq_n_u64(uint64_t __a) { +__funline uint64x2_t vdupq_n_u64(uint64_t __a) { return (uint64x2_t){__a, __a}; } -FUNK float16x4_t vdup_lane_f16(float16x4_t __a, const int __b) { +__funline float16x4_t vdup_lane_f16(float16x4_t __a, const int __b) { return __aarch64_vdup_lane_f16(__a, __b); } -FUNK float32x2_t vdup_lane_f32(float32x2_t __a, const int __b) { +__funline float32x2_t vdup_lane_f32(float32x2_t __a, const int __b) { return __aarch64_vdup_lane_f32(__a, __b); } -FUNK float64x1_t vdup_lane_f64(float64x1_t __a, const int __b) { +__funline float64x1_t vdup_lane_f64(float64x1_t __a, const int __b) { return __aarch64_vdup_lane_f64(__a, __b); } -FUNK poly8x8_t vdup_lane_p8(poly8x8_t __a, const int __b) { +__funline poly8x8_t vdup_lane_p8(poly8x8_t __a, const int __b) { return __aarch64_vdup_lane_p8(__a, __b); } -FUNK poly16x4_t vdup_lane_p16(poly16x4_t __a, const int __b) { +__funline poly16x4_t vdup_lane_p16(poly16x4_t __a, const int __b) { return __aarch64_vdup_lane_p16(__a, __b); } -FUNK poly64x1_t vdup_lane_p64(poly64x1_t __a, const int __b) { +__funline poly64x1_t vdup_lane_p64(poly64x1_t __a, const int __b) { return __aarch64_vdup_lane_p64(__a, __b); } -FUNK int8x8_t vdup_lane_s8(int8x8_t __a, const int __b) { +__funline int8x8_t vdup_lane_s8(int8x8_t __a, const int __b) { return __aarch64_vdup_lane_s8(__a, __b); } -FUNK int16x4_t vdup_lane_s16(int16x4_t __a, const int __b) { +__funline int16x4_t vdup_lane_s16(int16x4_t __a, const int __b) { return __aarch64_vdup_lane_s16(__a, __b); } -FUNK int32x2_t vdup_lane_s32(int32x2_t __a, const int __b) { +__funline int32x2_t vdup_lane_s32(int32x2_t __a, const int __b) { return __aarch64_vdup_lane_s32(__a, __b); } -FUNK int64x1_t vdup_lane_s64(int64x1_t __a, const int __b) { +__funline int64x1_t vdup_lane_s64(int64x1_t __a, const int __b) { return __aarch64_vdup_lane_s64(__a, __b); } -FUNK uint8x8_t vdup_lane_u8(uint8x8_t __a, const int __b) { +__funline uint8x8_t vdup_lane_u8(uint8x8_t __a, const int __b) { return __aarch64_vdup_lane_u8(__a, __b); } -FUNK uint16x4_t vdup_lane_u16(uint16x4_t __a, const int __b) { +__funline uint16x4_t vdup_lane_u16(uint16x4_t __a, const int __b) { return __aarch64_vdup_lane_u16(__a, __b); } -FUNK uint32x2_t vdup_lane_u32(uint32x2_t __a, const int __b) { +__funline uint32x2_t vdup_lane_u32(uint32x2_t __a, const int __b) { return __aarch64_vdup_lane_u32(__a, __b); } -FUNK uint64x1_t vdup_lane_u64(uint64x1_t __a, const int __b) { +__funline uint64x1_t vdup_lane_u64(uint64x1_t __a, const int __b) { return __aarch64_vdup_lane_u64(__a, __b); } -FUNK float16x4_t vdup_laneq_f16(float16x8_t __a, const int __b) { +__funline float16x4_t vdup_laneq_f16(float16x8_t __a, const int __b) { return __aarch64_vdup_laneq_f16(__a, __b); } -FUNK float32x2_t vdup_laneq_f32(float32x4_t __a, const int __b) { +__funline float32x2_t vdup_laneq_f32(float32x4_t __a, const int __b) { return __aarch64_vdup_laneq_f32(__a, 
__b); } -FUNK float64x1_t vdup_laneq_f64(float64x2_t __a, const int __b) { +__funline float64x1_t vdup_laneq_f64(float64x2_t __a, const int __b) { return __aarch64_vdup_laneq_f64(__a, __b); } -FUNK poly8x8_t vdup_laneq_p8(poly8x16_t __a, const int __b) { +__funline poly8x8_t vdup_laneq_p8(poly8x16_t __a, const int __b) { return __aarch64_vdup_laneq_p8(__a, __b); } -FUNK poly16x4_t vdup_laneq_p16(poly16x8_t __a, const int __b) { +__funline poly16x4_t vdup_laneq_p16(poly16x8_t __a, const int __b) { return __aarch64_vdup_laneq_p16(__a, __b); } -FUNK poly64x1_t vdup_laneq_p64(poly64x2_t __a, const int __b) { +__funline poly64x1_t vdup_laneq_p64(poly64x2_t __a, const int __b) { return __aarch64_vdup_laneq_p64(__a, __b); } -FUNK int8x8_t vdup_laneq_s8(int8x16_t __a, const int __b) { +__funline int8x8_t vdup_laneq_s8(int8x16_t __a, const int __b) { return __aarch64_vdup_laneq_s8(__a, __b); } -FUNK int16x4_t vdup_laneq_s16(int16x8_t __a, const int __b) { +__funline int16x4_t vdup_laneq_s16(int16x8_t __a, const int __b) { return __aarch64_vdup_laneq_s16(__a, __b); } -FUNK int32x2_t vdup_laneq_s32(int32x4_t __a, const int __b) { +__funline int32x2_t vdup_laneq_s32(int32x4_t __a, const int __b) { return __aarch64_vdup_laneq_s32(__a, __b); } -FUNK int64x1_t vdup_laneq_s64(int64x2_t __a, const int __b) { +__funline int64x1_t vdup_laneq_s64(int64x2_t __a, const int __b) { return __aarch64_vdup_laneq_s64(__a, __b); } -FUNK uint8x8_t vdup_laneq_u8(uint8x16_t __a, const int __b) { +__funline uint8x8_t vdup_laneq_u8(uint8x16_t __a, const int __b) { return __aarch64_vdup_laneq_u8(__a, __b); } -FUNK uint16x4_t vdup_laneq_u16(uint16x8_t __a, const int __b) { +__funline uint16x4_t vdup_laneq_u16(uint16x8_t __a, const int __b) { return __aarch64_vdup_laneq_u16(__a, __b); } -FUNK uint32x2_t vdup_laneq_u32(uint32x4_t __a, const int __b) { +__funline uint32x2_t vdup_laneq_u32(uint32x4_t __a, const int __b) { return __aarch64_vdup_laneq_u32(__a, __b); } -FUNK uint64x1_t vdup_laneq_u64(uint64x2_t __a, const int __b) { +__funline uint64x1_t vdup_laneq_u64(uint64x2_t __a, const int __b) { return __aarch64_vdup_laneq_u64(__a, __b); } -FUNK float16x8_t vdupq_lane_f16(float16x4_t __a, const int __b) { +__funline float16x8_t vdupq_lane_f16(float16x4_t __a, const int __b) { return __aarch64_vdupq_lane_f16(__a, __b); } -FUNK float32x4_t vdupq_lane_f32(float32x2_t __a, const int __b) { +__funline float32x4_t vdupq_lane_f32(float32x2_t __a, const int __b) { return __aarch64_vdupq_lane_f32(__a, __b); } -FUNK float64x2_t vdupq_lane_f64(float64x1_t __a, const int __b) { +__funline float64x2_t vdupq_lane_f64(float64x1_t __a, const int __b) { return __aarch64_vdupq_lane_f64(__a, __b); } -FUNK poly8x16_t vdupq_lane_p8(poly8x8_t __a, const int __b) { +__funline poly8x16_t vdupq_lane_p8(poly8x8_t __a, const int __b) { return __aarch64_vdupq_lane_p8(__a, __b); } -FUNK poly16x8_t vdupq_lane_p16(poly16x4_t __a, const int __b) { +__funline poly16x8_t vdupq_lane_p16(poly16x4_t __a, const int __b) { return __aarch64_vdupq_lane_p16(__a, __b); } -FUNK poly64x2_t vdupq_lane_p64(poly64x1_t __a, const int __b) { +__funline poly64x2_t vdupq_lane_p64(poly64x1_t __a, const int __b) { return __aarch64_vdupq_lane_p64(__a, __b); } -FUNK int8x16_t vdupq_lane_s8(int8x8_t __a, const int __b) { +__funline int8x16_t vdupq_lane_s8(int8x8_t __a, const int __b) { return __aarch64_vdupq_lane_s8(__a, __b); } -FUNK int16x8_t vdupq_lane_s16(int16x4_t __a, const int __b) { +__funline int16x8_t vdupq_lane_s16(int16x4_t __a, const int __b) { return 
__aarch64_vdupq_lane_s16(__a, __b); } -FUNK int32x4_t vdupq_lane_s32(int32x2_t __a, const int __b) { +__funline int32x4_t vdupq_lane_s32(int32x2_t __a, const int __b) { return __aarch64_vdupq_lane_s32(__a, __b); } -FUNK int64x2_t vdupq_lane_s64(int64x1_t __a, const int __b) { +__funline int64x2_t vdupq_lane_s64(int64x1_t __a, const int __b) { return __aarch64_vdupq_lane_s64(__a, __b); } -FUNK uint8x16_t vdupq_lane_u8(uint8x8_t __a, const int __b) { +__funline uint8x16_t vdupq_lane_u8(uint8x8_t __a, const int __b) { return __aarch64_vdupq_lane_u8(__a, __b); } -FUNK uint16x8_t vdupq_lane_u16(uint16x4_t __a, const int __b) { +__funline uint16x8_t vdupq_lane_u16(uint16x4_t __a, const int __b) { return __aarch64_vdupq_lane_u16(__a, __b); } -FUNK uint32x4_t vdupq_lane_u32(uint32x2_t __a, const int __b) { +__funline uint32x4_t vdupq_lane_u32(uint32x2_t __a, const int __b) { return __aarch64_vdupq_lane_u32(__a, __b); } -FUNK uint64x2_t vdupq_lane_u64(uint64x1_t __a, const int __b) { +__funline uint64x2_t vdupq_lane_u64(uint64x1_t __a, const int __b) { return __aarch64_vdupq_lane_u64(__a, __b); } -FUNK float16x8_t vdupq_laneq_f16(float16x8_t __a, const int __b) { +__funline float16x8_t vdupq_laneq_f16(float16x8_t __a, const int __b) { return __aarch64_vdupq_laneq_f16(__a, __b); } -FUNK float32x4_t vdupq_laneq_f32(float32x4_t __a, const int __b) { +__funline float32x4_t vdupq_laneq_f32(float32x4_t __a, const int __b) { return __aarch64_vdupq_laneq_f32(__a, __b); } -FUNK float64x2_t vdupq_laneq_f64(float64x2_t __a, const int __b) { +__funline float64x2_t vdupq_laneq_f64(float64x2_t __a, const int __b) { return __aarch64_vdupq_laneq_f64(__a, __b); } -FUNK poly8x16_t vdupq_laneq_p8(poly8x16_t __a, const int __b) { +__funline poly8x16_t vdupq_laneq_p8(poly8x16_t __a, const int __b) { return __aarch64_vdupq_laneq_p8(__a, __b); } -FUNK poly16x8_t vdupq_laneq_p16(poly16x8_t __a, const int __b) { +__funline poly16x8_t vdupq_laneq_p16(poly16x8_t __a, const int __b) { return __aarch64_vdupq_laneq_p16(__a, __b); } -FUNK poly64x2_t vdupq_laneq_p64(poly64x2_t __a, const int __b) { +__funline poly64x2_t vdupq_laneq_p64(poly64x2_t __a, const int __b) { return __aarch64_vdupq_laneq_p64(__a, __b); } -FUNK int8x16_t vdupq_laneq_s8(int8x16_t __a, const int __b) { +__funline int8x16_t vdupq_laneq_s8(int8x16_t __a, const int __b) { return __aarch64_vdupq_laneq_s8(__a, __b); } -FUNK int16x8_t vdupq_laneq_s16(int16x8_t __a, const int __b) { +__funline int16x8_t vdupq_laneq_s16(int16x8_t __a, const int __b) { return __aarch64_vdupq_laneq_s16(__a, __b); } -FUNK int32x4_t vdupq_laneq_s32(int32x4_t __a, const int __b) { +__funline int32x4_t vdupq_laneq_s32(int32x4_t __a, const int __b) { return __aarch64_vdupq_laneq_s32(__a, __b); } -FUNK int64x2_t vdupq_laneq_s64(int64x2_t __a, const int __b) { +__funline int64x2_t vdupq_laneq_s64(int64x2_t __a, const int __b) { return __aarch64_vdupq_laneq_s64(__a, __b); } -FUNK uint8x16_t vdupq_laneq_u8(uint8x16_t __a, const int __b) { +__funline uint8x16_t vdupq_laneq_u8(uint8x16_t __a, const int __b) { return __aarch64_vdupq_laneq_u8(__a, __b); } -FUNK uint16x8_t vdupq_laneq_u16(uint16x8_t __a, const int __b) { +__funline uint16x8_t vdupq_laneq_u16(uint16x8_t __a, const int __b) { return __aarch64_vdupq_laneq_u16(__a, __b); } -FUNK uint32x4_t vdupq_laneq_u32(uint32x4_t __a, const int __b) { +__funline uint32x4_t vdupq_laneq_u32(uint32x4_t __a, const int __b) { return __aarch64_vdupq_laneq_u32(__a, __b); } -FUNK uint64x2_t vdupq_laneq_u64(uint64x2_t __a, const int __b) { +__funline 
uint64x2_t vdupq_laneq_u64(uint64x2_t __a, const int __b) { return __aarch64_vdupq_laneq_u64(__a, __b); } -FUNK poly8_t vdupb_lane_p8(poly8x8_t __a, const int __b) { +__funline poly8_t vdupb_lane_p8(poly8x8_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK int8_t vdupb_lane_s8(int8x8_t __a, const int __b) { +__funline int8_t vdupb_lane_s8(int8x8_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK uint8_t vdupb_lane_u8(uint8x8_t __a, const int __b) { +__funline uint8_t vdupb_lane_u8(uint8x8_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK float16_t vduph_lane_f16(float16x4_t __a, const int __b) { +__funline float16_t vduph_lane_f16(float16x4_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK poly16_t vduph_lane_p16(poly16x4_t __a, const int __b) { +__funline poly16_t vduph_lane_p16(poly16x4_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK int16_t vduph_lane_s16(int16x4_t __a, const int __b) { +__funline int16_t vduph_lane_s16(int16x4_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK uint16_t vduph_lane_u16(uint16x4_t __a, const int __b) { +__funline uint16_t vduph_lane_u16(uint16x4_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK float32_t vdups_lane_f32(float32x2_t __a, const int __b) { +__funline float32_t vdups_lane_f32(float32x2_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK int32_t vdups_lane_s32(int32x2_t __a, const int __b) { +__funline int32_t vdups_lane_s32(int32x2_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK uint32_t vdups_lane_u32(uint32x2_t __a, const int __b) { +__funline uint32_t vdups_lane_u32(uint32x2_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK float64_t vdupd_lane_f64(float64x1_t __a, const int __b) { +__funline float64_t vdupd_lane_f64(float64x1_t __a, const int __b) { __AARCH64_LANE_CHECK(__a, __b); return __a[0]; } -FUNK int64_t vdupd_lane_s64(int64x1_t __a, const int __b) { +__funline int64_t vdupd_lane_s64(int64x1_t __a, const int __b) { __AARCH64_LANE_CHECK(__a, __b); return __a[0]; } -FUNK uint64_t vdupd_lane_u64(uint64x1_t __a, const int __b) { +__funline uint64_t vdupd_lane_u64(uint64x1_t __a, const int __b) { __AARCH64_LANE_CHECK(__a, __b); return __a[0]; } -FUNK poly8_t vdupb_laneq_p8(poly8x16_t __a, const int __b) { +__funline poly8_t vdupb_laneq_p8(poly8x16_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK int8_t vdupb_laneq_s8(int8x16_t __a, const int __b) { +__funline int8_t vdupb_laneq_s8(int8x16_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK uint8_t vdupb_laneq_u8(uint8x16_t __a, const int __b) { +__funline uint8_t vdupb_laneq_u8(uint8x16_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK float16_t vduph_laneq_f16(float16x8_t __a, const int __b) { +__funline float16_t vduph_laneq_f16(float16x8_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK poly16_t vduph_laneq_p16(poly16x8_t __a, const int __b) { +__funline poly16_t vduph_laneq_p16(poly16x8_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK int16_t vduph_laneq_s16(int16x8_t __a, const int __b) { +__funline int16_t vduph_laneq_s16(int16x8_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK uint16_t vduph_laneq_u16(uint16x8_t __a, const int __b) { +__funline uint16_t vduph_laneq_u16(uint16x8_t __a, const int __b) { 
return __aarch64_vget_lane_any(__a, __b); } -FUNK float32_t vdups_laneq_f32(float32x4_t __a, const int __b) { +__funline float32_t vdups_laneq_f32(float32x4_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK int32_t vdups_laneq_s32(int32x4_t __a, const int __b) { +__funline int32_t vdups_laneq_s32(int32x4_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK uint32_t vdups_laneq_u32(uint32x4_t __a, const int __b) { +__funline uint32_t vdups_laneq_u32(uint32x4_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK float64_t vdupd_laneq_f64(float64x2_t __a, const int __b) { +__funline float64_t vdupd_laneq_f64(float64x2_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK int64_t vdupd_laneq_s64(int64x2_t __a, const int __b) { +__funline int64_t vdupd_laneq_s64(int64x2_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK uint64_t vdupd_laneq_u64(uint64x2_t __a, const int __b) { +__funline uint64_t vdupd_laneq_u64(uint64x2_t __a, const int __b) { return __aarch64_vget_lane_any(__a, __b); } -FUNK float16x4_t vext_f16(float16x4_t __a, float16x4_t __b, __const int __c) { +__funline float16x4_t vext_f16(float16x4_t __a, float16x4_t __b, + __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, @@ -10502,7 +10526,8 @@ FUNK float16x4_t vext_f16(float16x4_t __a, float16x4_t __b, __const int __c) { #endif } -FUNK float32x2_t vext_f32(float32x2_t __a, float32x2_t __b, __const int __c) { +__funline float32x2_t vext_f32(float32x2_t __a, float32x2_t __b, + __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, (uint32x2_t){2 - __c, 3 - __c}); @@ -10511,12 +10536,13 @@ FUNK float32x2_t vext_f32(float32x2_t __a, float32x2_t __b, __const int __c) { #endif } -FUNK float64x1_t vext_f64(float64x1_t __a, float64x1_t __b, __const int __c) { +__funline float64x1_t vext_f64(float64x1_t __a, float64x1_t __b, + __const int __c) { __AARCH64_LANE_CHECK(__a, __c); return __a; } -FUNK poly8x8_t vext_p8(poly8x8_t __a, poly8x8_t __b, __const int __c) { +__funline poly8x8_t vext_p8(poly8x8_t __a, poly8x8_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, @@ -10529,7 +10555,7 @@ FUNK poly8x8_t vext_p8(poly8x8_t __a, poly8x8_t __b, __const int __c) { #endif } -FUNK poly16x4_t vext_p16(poly16x4_t __a, poly16x4_t __b, __const int __c) { +__funline poly16x4_t vext_p16(poly16x4_t __a, poly16x4_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, @@ -10540,13 +10566,13 @@ FUNK poly16x4_t vext_p16(poly16x4_t __a, poly16x4_t __b, __const int __c) { #endif } -FUNK poly64x1_t vext_p64(poly64x1_t __a, poly64x1_t __b, __const int __c) { +__funline poly64x1_t vext_p64(poly64x1_t __a, poly64x1_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); return __a; } -FUNK int8x8_t vext_s8(int8x8_t __a, int8x8_t __b, __const int __c) { +__funline int8x8_t vext_s8(int8x8_t __a, int8x8_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, @@ -10559,7 +10585,7 @@ FUNK int8x8_t vext_s8(int8x8_t __a, int8x8_t __b, __const int __c) { #endif } -FUNK int16x4_t vext_s16(int16x4_t __a, int16x4_t __b, __const int __c) { +__funline int16x4_t vext_s16(int16x4_t __a, int16x4_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return 
__builtin_shuffle(__b, __a, @@ -10570,7 +10596,7 @@ FUNK int16x4_t vext_s16(int16x4_t __a, int16x4_t __b, __const int __c) { #endif } -FUNK int32x2_t vext_s32(int32x2_t __a, int32x2_t __b, __const int __c) { +__funline int32x2_t vext_s32(int32x2_t __a, int32x2_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, (uint32x2_t){2 - __c, 3 - __c}); @@ -10579,13 +10605,13 @@ FUNK int32x2_t vext_s32(int32x2_t __a, int32x2_t __b, __const int __c) { #endif } -FUNK int64x1_t vext_s64(int64x1_t __a, int64x1_t __b, __const int __c) { +__funline int64x1_t vext_s64(int64x1_t __a, int64x1_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); return __a; } -FUNK uint8x8_t vext_u8(uint8x8_t __a, uint8x8_t __b, __const int __c) { +__funline uint8x8_t vext_u8(uint8x8_t __a, uint8x8_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, @@ -10598,7 +10624,7 @@ FUNK uint8x8_t vext_u8(uint8x8_t __a, uint8x8_t __b, __const int __c) { #endif } -FUNK uint16x4_t vext_u16(uint16x4_t __a, uint16x4_t __b, __const int __c) { +__funline uint16x4_t vext_u16(uint16x4_t __a, uint16x4_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, @@ -10609,7 +10635,7 @@ FUNK uint16x4_t vext_u16(uint16x4_t __a, uint16x4_t __b, __const int __c) { #endif } -FUNK uint32x2_t vext_u32(uint32x2_t __a, uint32x2_t __b, __const int __c) { +__funline uint32x2_t vext_u32(uint32x2_t __a, uint32x2_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, (uint32x2_t){2 - __c, 3 - __c}); @@ -10618,13 +10644,14 @@ FUNK uint32x2_t vext_u32(uint32x2_t __a, uint32x2_t __b, __const int __c) { #endif } -FUNK uint64x1_t vext_u64(uint64x1_t __a, uint64x1_t __b, __const int __c) { +__funline uint64x1_t vext_u64(uint64x1_t __a, uint64x1_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); return __a; } -FUNK float16x8_t vextq_f16(float16x8_t __a, float16x8_t __b, __const int __c) { +__funline float16x8_t vextq_f16(float16x8_t __a, float16x8_t __b, + __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle( @@ -10638,7 +10665,8 @@ FUNK float16x8_t vextq_f16(float16x8_t __a, float16x8_t __b, __const int __c) { #endif } -FUNK float32x4_t vextq_f32(float32x4_t __a, float32x4_t __b, __const int __c) { +__funline float32x4_t vextq_f32(float32x4_t __a, float32x4_t __b, + __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, @@ -10649,7 +10677,8 @@ FUNK float32x4_t vextq_f32(float32x4_t __a, float32x4_t __b, __const int __c) { #endif } -FUNK float64x2_t vextq_f64(float64x2_t __a, float64x2_t __b, __const int __c) { +__funline float64x2_t vextq_f64(float64x2_t __a, float64x2_t __b, + __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, (uint64x2_t){2 - __c, 3 - __c}); @@ -10658,7 +10687,7 @@ FUNK float64x2_t vextq_f64(float64x2_t __a, float64x2_t __b, __const int __c) { #endif } -FUNK poly8x16_t vextq_p8(poly8x16_t __a, poly8x16_t __b, __const int __c) { +__funline poly8x16_t vextq_p8(poly8x16_t __a, poly8x16_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle( @@ -10675,7 +10704,8 @@ FUNK poly8x16_t vextq_p8(poly8x16_t __a, poly8x16_t __b, __const int __c) { #endif } -FUNK poly16x8_t vextq_p16(poly16x8_t __a, poly16x8_t __b, 
__const int __c) { +__funline poly16x8_t vextq_p16(poly16x8_t __a, poly16x8_t __b, + __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle( @@ -10689,7 +10719,8 @@ FUNK poly16x8_t vextq_p16(poly16x8_t __a, poly16x8_t __b, __const int __c) { #endif } -FUNK poly64x2_t vextq_p64(poly64x2_t __a, poly64x2_t __b, __const int __c) { +__funline poly64x2_t vextq_p64(poly64x2_t __a, poly64x2_t __b, + __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, (uint64x2_t){2 - __c, 3 - __c}); @@ -10698,7 +10729,7 @@ FUNK poly64x2_t vextq_p64(poly64x2_t __a, poly64x2_t __b, __const int __c) { #endif } -FUNK int8x16_t vextq_s8(int8x16_t __a, int8x16_t __b, __const int __c) { +__funline int8x16_t vextq_s8(int8x16_t __a, int8x16_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle( @@ -10715,7 +10746,7 @@ FUNK int8x16_t vextq_s8(int8x16_t __a, int8x16_t __b, __const int __c) { #endif } -FUNK int16x8_t vextq_s16(int16x8_t __a, int16x8_t __b, __const int __c) { +__funline int16x8_t vextq_s16(int16x8_t __a, int16x8_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle( @@ -10729,7 +10760,7 @@ FUNK int16x8_t vextq_s16(int16x8_t __a, int16x8_t __b, __const int __c) { #endif } -FUNK int32x4_t vextq_s32(int32x4_t __a, int32x4_t __b, __const int __c) { +__funline int32x4_t vextq_s32(int32x4_t __a, int32x4_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, @@ -10740,7 +10771,7 @@ FUNK int32x4_t vextq_s32(int32x4_t __a, int32x4_t __b, __const int __c) { #endif } -FUNK int64x2_t vextq_s64(int64x2_t __a, int64x2_t __b, __const int __c) { +__funline int64x2_t vextq_s64(int64x2_t __a, int64x2_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, (uint64x2_t){2 - __c, 3 - __c}); @@ -10749,7 +10780,7 @@ FUNK int64x2_t vextq_s64(int64x2_t __a, int64x2_t __b, __const int __c) { #endif } -FUNK uint8x16_t vextq_u8(uint8x16_t __a, uint8x16_t __b, __const int __c) { +__funline uint8x16_t vextq_u8(uint8x16_t __a, uint8x16_t __b, __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle( @@ -10766,7 +10797,8 @@ FUNK uint8x16_t vextq_u8(uint8x16_t __a, uint8x16_t __b, __const int __c) { #endif } -FUNK uint16x8_t vextq_u16(uint16x8_t __a, uint16x8_t __b, __const int __c) { +__funline uint16x8_t vextq_u16(uint16x8_t __a, uint16x8_t __b, + __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle( @@ -10780,7 +10812,8 @@ FUNK uint16x8_t vextq_u16(uint16x8_t __a, uint16x8_t __b, __const int __c) { #endif } -FUNK uint32x4_t vextq_u32(uint32x4_t __a, uint32x4_t __b, __const int __c) { +__funline uint32x4_t vextq_u32(uint32x4_t __a, uint32x4_t __b, + __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, @@ -10791,7 +10824,8 @@ FUNK uint32x4_t vextq_u32(uint32x4_t __a, uint32x4_t __b, __const int __c) { #endif } -FUNK uint64x2_t vextq_u64(uint64x2_t __a, uint64x2_t __b, __const int __c) { +__funline uint64x2_t vextq_u64(uint64x2_t __a, uint64x2_t __b, + __const int __c) { __AARCH64_LANE_CHECK(__a, __c); #ifdef __AARCH64EB__ return __builtin_shuffle(__b, __a, (uint64x2_t){2 - __c, 3 - __c}); @@ -10800,264 +10834,280 @@ FUNK uint64x2_t vextq_u64(uint64x2_t __a, uint64x2_t __b, __const int __c) { #endif 
} -FUNK float64x1_t vfma_f64(float64x1_t __a, float64x1_t __b, float64x1_t __c) { +__funline float64x1_t vfma_f64(float64x1_t __a, float64x1_t __b, + float64x1_t __c) { return (float64x1_t){__builtin_fma(__b[0], __c[0], __a[0])}; } -FUNK float32x2_t vfma_f32(float32x2_t __a, float32x2_t __b, float32x2_t __c) { +__funline float32x2_t vfma_f32(float32x2_t __a, float32x2_t __b, + float32x2_t __c) { return __builtin_aarch64_fmav2sf(__b, __c, __a); } -FUNK float32x4_t vfmaq_f32(float32x4_t __a, float32x4_t __b, float32x4_t __c) { +__funline float32x4_t vfmaq_f32(float32x4_t __a, float32x4_t __b, + float32x4_t __c) { return __builtin_aarch64_fmav4sf(__b, __c, __a); } -FUNK float64x2_t vfmaq_f64(float64x2_t __a, float64x2_t __b, float64x2_t __c) { +__funline float64x2_t vfmaq_f64(float64x2_t __a, float64x2_t __b, + float64x2_t __c) { return __builtin_aarch64_fmav2df(__b, __c, __a); } -FUNK float32x2_t vfma_n_f32(float32x2_t __a, float32x2_t __b, float32_t __c) { +__funline float32x2_t vfma_n_f32(float32x2_t __a, float32x2_t __b, + float32_t __c) { return __builtin_aarch64_fmav2sf(__b, vdup_n_f32(__c), __a); } -FUNK float64x1_t vfma_n_f64(float64x1_t __a, float64x1_t __b, float64_t __c) { +__funline float64x1_t vfma_n_f64(float64x1_t __a, float64x1_t __b, + float64_t __c) { return (float64x1_t){__b[0] * __c + __a[0]}; } -FUNK float32x4_t vfmaq_n_f32(float32x4_t __a, float32x4_t __b, float32_t __c) { +__funline float32x4_t vfmaq_n_f32(float32x4_t __a, float32x4_t __b, + float32_t __c) { return __builtin_aarch64_fmav4sf(__b, vdupq_n_f32(__c), __a); } -FUNK float64x2_t vfmaq_n_f64(float64x2_t __a, float64x2_t __b, float64_t __c) { +__funline float64x2_t vfmaq_n_f64(float64x2_t __a, float64x2_t __b, + float64_t __c) { return __builtin_aarch64_fmav2df(__b, vdupq_n_f64(__c), __a); } -FUNK float32x2_t vfma_lane_f32(float32x2_t __a, float32x2_t __b, - float32x2_t __c, const int __lane) { +__funline float32x2_t vfma_lane_f32(float32x2_t __a, float32x2_t __b, + float32x2_t __c, const int __lane) { return __builtin_aarch64_fmav2sf(__b, __aarch64_vdup_lane_f32(__c, __lane), __a); } -FUNK float64x1_t vfma_lane_f64(float64x1_t __a, float64x1_t __b, - float64x1_t __c, const int __lane) { +__funline float64x1_t vfma_lane_f64(float64x1_t __a, float64x1_t __b, + float64x1_t __c, const int __lane) { return (float64x1_t){__builtin_fma(__b[0], __c[0], __a[0])}; } -FUNK float64_t vfmad_lane_f64(float64_t __a, float64_t __b, float64x1_t __c, - const int __lane) { +__funline float64_t vfmad_lane_f64(float64_t __a, float64_t __b, + float64x1_t __c, const int __lane) { return __builtin_fma(__b, __c[0], __a); } -FUNK float32_t vfmas_lane_f32(float32_t __a, float32_t __b, float32x2_t __c, - const int __lane) { +__funline float32_t vfmas_lane_f32(float32_t __a, float32_t __b, + float32x2_t __c, const int __lane) { return __builtin_fmaf(__b, __aarch64_vget_lane_any(__c, __lane), __a); } -FUNK float32x2_t vfma_laneq_f32(float32x2_t __a, float32x2_t __b, - float32x4_t __c, const int __lane) { +__funline float32x2_t vfma_laneq_f32(float32x2_t __a, float32x2_t __b, + float32x4_t __c, const int __lane) { return __builtin_aarch64_fmav2sf(__b, __aarch64_vdup_laneq_f32(__c, __lane), __a); } -FUNK float64x1_t vfma_laneq_f64(float64x1_t __a, float64x1_t __b, - float64x2_t __c, const int __lane) { +__funline float64x1_t vfma_laneq_f64(float64x1_t __a, float64x1_t __b, + float64x2_t __c, const int __lane) { float64_t __c0 = __aarch64_vget_lane_any(__c, __lane); return (float64x1_t){__builtin_fma(__b[0], __c0, __a[0])}; } -FUNK float64_t 
vfmad_laneq_f64(float64_t __a, float64_t __b, float64x2_t __c, - const int __lane) { +__funline float64_t vfmad_laneq_f64(float64_t __a, float64_t __b, + float64x2_t __c, const int __lane) { return __builtin_fma(__b, __aarch64_vget_lane_any(__c, __lane), __a); } -FUNK float32_t vfmas_laneq_f32(float32_t __a, float32_t __b, float32x4_t __c, - const int __lane) { +__funline float32_t vfmas_laneq_f32(float32_t __a, float32_t __b, + float32x4_t __c, const int __lane) { return __builtin_fmaf(__b, __aarch64_vget_lane_any(__c, __lane), __a); } -FUNK float32x4_t vfmaq_lane_f32(float32x4_t __a, float32x4_t __b, - float32x2_t __c, const int __lane) { +__funline float32x4_t vfmaq_lane_f32(float32x4_t __a, float32x4_t __b, + float32x2_t __c, const int __lane) { return __builtin_aarch64_fmav4sf(__b, __aarch64_vdupq_lane_f32(__c, __lane), __a); } -FUNK float64x2_t vfmaq_lane_f64(float64x2_t __a, float64x2_t __b, - float64x1_t __c, const int __lane) { +__funline float64x2_t vfmaq_lane_f64(float64x2_t __a, float64x2_t __b, + float64x1_t __c, const int __lane) { return __builtin_aarch64_fmav2df(__b, vdupq_n_f64(__c[0]), __a); } -FUNK float32x4_t vfmaq_laneq_f32(float32x4_t __a, float32x4_t __b, - float32x4_t __c, const int __lane) { +__funline float32x4_t vfmaq_laneq_f32(float32x4_t __a, float32x4_t __b, + float32x4_t __c, const int __lane) { return __builtin_aarch64_fmav4sf(__b, __aarch64_vdupq_laneq_f32(__c, __lane), __a); } -FUNK float64x2_t vfmaq_laneq_f64(float64x2_t __a, float64x2_t __b, - float64x2_t __c, const int __lane) { +__funline float64x2_t vfmaq_laneq_f64(float64x2_t __a, float64x2_t __b, + float64x2_t __c, const int __lane) { return __builtin_aarch64_fmav2df(__b, __aarch64_vdupq_laneq_f64(__c, __lane), __a); } -FUNK float64x1_t vfms_f64(float64x1_t __a, float64x1_t __b, float64x1_t __c) { +__funline float64x1_t vfms_f64(float64x1_t __a, float64x1_t __b, + float64x1_t __c) { return (float64x1_t){__builtin_fma(-__b[0], __c[0], __a[0])}; } -FUNK float32x2_t vfms_f32(float32x2_t __a, float32x2_t __b, float32x2_t __c) { +__funline float32x2_t vfms_f32(float32x2_t __a, float32x2_t __b, + float32x2_t __c) { return __builtin_aarch64_fmav2sf(-__b, __c, __a); } -FUNK float32x4_t vfmsq_f32(float32x4_t __a, float32x4_t __b, float32x4_t __c) { +__funline float32x4_t vfmsq_f32(float32x4_t __a, float32x4_t __b, + float32x4_t __c) { return __builtin_aarch64_fmav4sf(-__b, __c, __a); } -FUNK float64x2_t vfmsq_f64(float64x2_t __a, float64x2_t __b, float64x2_t __c) { +__funline float64x2_t vfmsq_f64(float64x2_t __a, float64x2_t __b, + float64x2_t __c) { return __builtin_aarch64_fmav2df(-__b, __c, __a); } -FUNK float32x2_t vfms_n_f32(float32x2_t __a, float32x2_t __b, float32_t __c) { +__funline float32x2_t vfms_n_f32(float32x2_t __a, float32x2_t __b, + float32_t __c) { return __builtin_aarch64_fmav2sf(-__b, vdup_n_f32(__c), __a); } -FUNK float64x1_t vfms_n_f64(float64x1_t __a, float64x1_t __b, float64_t __c) { +__funline float64x1_t vfms_n_f64(float64x1_t __a, float64x1_t __b, + float64_t __c) { return (float64x1_t){-__b[0] * __c + __a[0]}; } -FUNK float32x4_t vfmsq_n_f32(float32x4_t __a, float32x4_t __b, float32_t __c) { +__funline float32x4_t vfmsq_n_f32(float32x4_t __a, float32x4_t __b, + float32_t __c) { return __builtin_aarch64_fmav4sf(-__b, vdupq_n_f32(__c), __a); } -FUNK float64x2_t vfmsq_n_f64(float64x2_t __a, float64x2_t __b, float64_t __c) { +__funline float64x2_t vfmsq_n_f64(float64x2_t __a, float64x2_t __b, + float64_t __c) { return __builtin_aarch64_fmav2df(-__b, vdupq_n_f64(__c), __a); } -FUNK 
float32x2_t vfms_lane_f32(float32x2_t __a, float32x2_t __b, - float32x2_t __c, const int __lane) { +__funline float32x2_t vfms_lane_f32(float32x2_t __a, float32x2_t __b, + float32x2_t __c, const int __lane) { return __builtin_aarch64_fmav2sf(-__b, __aarch64_vdup_lane_f32(__c, __lane), __a); } -FUNK float64x1_t vfms_lane_f64(float64x1_t __a, float64x1_t __b, - float64x1_t __c, const int __lane) { +__funline float64x1_t vfms_lane_f64(float64x1_t __a, float64x1_t __b, + float64x1_t __c, const int __lane) { return (float64x1_t){__builtin_fma(-__b[0], __c[0], __a[0])}; } -FUNK float64_t vfmsd_lane_f64(float64_t __a, float64_t __b, float64x1_t __c, - const int __lane) { +__funline float64_t vfmsd_lane_f64(float64_t __a, float64_t __b, + float64x1_t __c, const int __lane) { return __builtin_fma(-__b, __c[0], __a); } -FUNK float32_t vfmss_lane_f32(float32_t __a, float32_t __b, float32x2_t __c, - const int __lane) { +__funline float32_t vfmss_lane_f32(float32_t __a, float32_t __b, + float32x2_t __c, const int __lane) { return __builtin_fmaf(-__b, __aarch64_vget_lane_any(__c, __lane), __a); } -FUNK float32x2_t vfms_laneq_f32(float32x2_t __a, float32x2_t __b, - float32x4_t __c, const int __lane) { +__funline float32x2_t vfms_laneq_f32(float32x2_t __a, float32x2_t __b, + float32x4_t __c, const int __lane) { return __builtin_aarch64_fmav2sf(-__b, __aarch64_vdup_laneq_f32(__c, __lane), __a); } -FUNK float64x1_t vfms_laneq_f64(float64x1_t __a, float64x1_t __b, - float64x2_t __c, const int __lane) { +__funline float64x1_t vfms_laneq_f64(float64x1_t __a, float64x1_t __b, + float64x2_t __c, const int __lane) { float64_t __c0 = __aarch64_vget_lane_any(__c, __lane); return (float64x1_t){__builtin_fma(-__b[0], __c0, __a[0])}; } -FUNK float64_t vfmsd_laneq_f64(float64_t __a, float64_t __b, float64x2_t __c, - const int __lane) { +__funline float64_t vfmsd_laneq_f64(float64_t __a, float64_t __b, + float64x2_t __c, const int __lane) { return __builtin_fma(-__b, __aarch64_vget_lane_any(__c, __lane), __a); } -FUNK float32_t vfmss_laneq_f32(float32_t __a, float32_t __b, float32x4_t __c, - const int __lane) { +__funline float32_t vfmss_laneq_f32(float32_t __a, float32_t __b, + float32x4_t __c, const int __lane) { return __builtin_fmaf(-__b, __aarch64_vget_lane_any(__c, __lane), __a); } -FUNK float32x4_t vfmsq_lane_f32(float32x4_t __a, float32x4_t __b, - float32x2_t __c, const int __lane) { +__funline float32x4_t vfmsq_lane_f32(float32x4_t __a, float32x4_t __b, + float32x2_t __c, const int __lane) { return __builtin_aarch64_fmav4sf(-__b, __aarch64_vdupq_lane_f32(__c, __lane), __a); } -FUNK float64x2_t vfmsq_lane_f64(float64x2_t __a, float64x2_t __b, - float64x1_t __c, const int __lane) { +__funline float64x2_t vfmsq_lane_f64(float64x2_t __a, float64x2_t __b, + float64x1_t __c, const int __lane) { return __builtin_aarch64_fmav2df(-__b, vdupq_n_f64(__c[0]), __a); } -FUNK float32x4_t vfmsq_laneq_f32(float32x4_t __a, float32x4_t __b, - float32x4_t __c, const int __lane) { +__funline float32x4_t vfmsq_laneq_f32(float32x4_t __a, float32x4_t __b, + float32x4_t __c, const int __lane) { return __builtin_aarch64_fmav4sf(-__b, __aarch64_vdupq_laneq_f32(__c, __lane), __a); } -FUNK float64x2_t vfmsq_laneq_f64(float64x2_t __a, float64x2_t __b, - float64x2_t __c, const int __lane) { +__funline float64x2_t vfmsq_laneq_f64(float64x2_t __a, float64x2_t __b, + float64x2_t __c, const int __lane) { return __builtin_aarch64_fmav2df(-__b, __aarch64_vdupq_laneq_f64(__c, __lane), __a); } -FUNK float16x4_t vld1_f16(const float16_t *__a) { 
+__funline float16x4_t vld1_f16(const float16_t *__a) { return __builtin_aarch64_ld1v4hf(__a); } -FUNK float32x2_t vld1_f32(const float32_t *a) { +__funline float32x2_t vld1_f32(const float32_t *a) { return __builtin_aarch64_ld1v2sf((const __builtin_aarch64_simd_sf *)a); } -FUNK float64x1_t vld1_f64(const float64_t *a) { +__funline float64x1_t vld1_f64(const float64_t *a) { return (float64x1_t){*a}; } -FUNK poly8x8_t vld1_p8(const poly8_t *a) { +__funline poly8x8_t vld1_p8(const poly8_t *a) { return (poly8x8_t)__builtin_aarch64_ld1v8qi( (const __builtin_aarch64_simd_qi *)a); } -FUNK poly16x4_t vld1_p16(const poly16_t *a) { +__funline poly16x4_t vld1_p16(const poly16_t *a) { return (poly16x4_t)__builtin_aarch64_ld1v4hi( (const __builtin_aarch64_simd_hi *)a); } -FUNK poly64x1_t vld1_p64(const poly64_t *a) { +__funline poly64x1_t vld1_p64(const poly64_t *a) { return (poly64x1_t){*a}; } -FUNK int8x8_t vld1_s8(const int8_t *a) { +__funline int8x8_t vld1_s8(const int8_t *a) { return __builtin_aarch64_ld1v8qi((const __builtin_aarch64_simd_qi *)a); } -FUNK int16x4_t vld1_s16(const int16_t *a) { +__funline int16x4_t vld1_s16(const int16_t *a) { return __builtin_aarch64_ld1v4hi((const __builtin_aarch64_simd_hi *)a); } -FUNK int32x2_t vld1_s32(const int32_t *a) { +__funline int32x2_t vld1_s32(const int32_t *a) { return __builtin_aarch64_ld1v2si((const __builtin_aarch64_simd_si *)a); } -FUNK int64x1_t vld1_s64(const int64_t *a) { +__funline int64x1_t vld1_s64(const int64_t *a) { return (int64x1_t){*a}; } -FUNK uint8x8_t vld1_u8(const uint8_t *a) { +__funline uint8x8_t vld1_u8(const uint8_t *a) { return (uint8x8_t)__builtin_aarch64_ld1v8qi( (const __builtin_aarch64_simd_qi *)a); } -FUNK uint16x4_t vld1_u16(const uint16_t *a) { +__funline uint16x4_t vld1_u16(const uint16_t *a) { return (uint16x4_t)__builtin_aarch64_ld1v4hi( (const __builtin_aarch64_simd_hi *)a); } -FUNK uint32x2_t vld1_u32(const uint32_t *a) { +__funline uint32x2_t vld1_u32(const uint32_t *a) { return (uint32x2_t)__builtin_aarch64_ld1v2si( (const __builtin_aarch64_simd_si *)a); } -FUNK uint64x1_t vld1_u64(const uint64_t *a) { +__funline uint64x1_t vld1_u64(const uint64_t *a) { return (uint64x1_t){*a}; } -FUNK uint8x8x3_t vld1_u8_x3(const uint8_t *__a) { +__funline uint8x8x3_t vld1_u8_x3(const uint8_t *__a) { uint8x8x3_t __i; __builtin_aarch64_simd_ci __o; __o = (__builtin_aarch64_simd_ci)__builtin_aarch64_ld1x3v8qi( @@ -11068,7 +11118,7 @@ FUNK uint8x8x3_t vld1_u8_x3(const uint8_t *__a) { return __i; } -FUNK int8x8x3_t vld1_s8_x3(const uint8_t *__a) { +__funline int8x8x3_t vld1_s8_x3(const int8_t *__a) { int8x8x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v8qi((const __builtin_aarch64_simd_qi *)__a); @@ -11078,7 +11128,7 @@ FUNK int8x8x3_t vld1_s8_x3(const uint8_t *__a) { return __i; } -FUNK uint16x4x3_t vld1_u16_x3(const uint16_t *__a) { +__funline uint16x4x3_t vld1_u16_x3(const uint16_t *__a) { uint16x4x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v4hi((const __builtin_aarch64_simd_hi *)__a); @@ -11088,7 +11138,7 @@ FUNK uint16x4x3_t vld1_u16_x3(const uint16_t *__a) { return __i; } -FUNK int16x4x3_t vld1_s16_x3(const int16_t *__a) { +__funline int16x4x3_t vld1_s16_x3(const int16_t *__a) { int16x4x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v4hi((const __builtin_aarch64_simd_hi *)__a); @@ -11098,7 +11148,7 @@ FUNK int16x4x3_t vld1_s16_x3(const int16_t *__a) { return __i; } -FUNK uint32x2x3_t vld1_u32_x3(const uint32_t *__a) { +__funline uint32x2x3_t
vld1_u32_x3(const uint32_t *__a) { uint32x2x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v2si((const __builtin_aarch64_simd_si *)__a); @@ -11108,7 +11158,7 @@ FUNK uint32x2x3_t vld1_u32_x3(const uint32_t *__a) { return __i; } -FUNK int32x2x3_t vld1_s32_x3(const uint32_t *__a) { +__funline int32x2x3_t vld1_s32_x3(const int32_t *__a) { int32x2x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v2si((const __builtin_aarch64_simd_si *)__a); @@ -11118,7 +11168,7 @@ FUNK int32x2x3_t vld1_s32_x3(const uint32_t *__a) { return __i; } -FUNK uint64x1x3_t vld1_u64_x3(const uint64_t *__a) { +__funline uint64x1x3_t vld1_u64_x3(const uint64_t *__a) { uint64x1x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3di((const __builtin_aarch64_simd_di *)__a); @@ -11128,7 +11178,7 @@ FUNK uint64x1x3_t vld1_u64_x3(const uint64_t *__a) { return __i; } -FUNK int64x1x3_t vld1_s64_x3(const int64_t *__a) { +__funline int64x1x3_t vld1_s64_x3(const int64_t *__a) { int64x1x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3di((const __builtin_aarch64_simd_di *)__a); @@ -11139,7 +11189,7 @@ FUNK int64x1x3_t vld1_s64_x3(const int64_t *__a) { return __i; } -FUNK float16x4x3_t vld1_f16_x3(const float16_t *__a) { +__funline float16x4x3_t vld1_f16_x3(const float16_t *__a) { float16x4x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v4hf((const __builtin_aarch64_simd_hf *)__a); @@ -11149,7 +11199,7 @@ FUNK float16x4x3_t vld1_f16_x3(const float16_t *__a) { return __i; } -FUNK float32x2x3_t vld1_f32_x3(const float32_t *__a) { +__funline float32x2x3_t vld1_f32_x3(const float32_t *__a) { float32x2x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v2sf((const __builtin_aarch64_simd_sf *)__a); @@ -11159,7 +11209,7 @@ FUNK float32x2x3_t vld1_f32_x3(const float32_t *__a) { return __i; } -FUNK float64x1x3_t vld1_f64_x3(const float64_t *__a) { +__funline float64x1x3_t vld1_f64_x3(const float64_t *__a) { float64x1x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3df((const __builtin_aarch64_simd_df *)__a); @@ -11169,7 +11219,7 @@ FUNK float64x1x3_t vld1_f64_x3(const float64_t *__a) { return __i; } -FUNK poly8x8x3_t vld1_p8_x3(const poly8_t *__a) { +__funline poly8x8x3_t vld1_p8_x3(const poly8_t *__a) { poly8x8x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v8qi((const __builtin_aarch64_simd_qi *)__a); @@ -11179,7 +11229,7 @@ FUNK poly8x8x3_t vld1_p8_x3(const poly8_t *__a) { return __i; } -FUNK poly16x4x3_t vld1_p16_x3(const poly16_t *__a) { +__funline poly16x4x3_t vld1_p16_x3(const poly16_t *__a) { poly16x4x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v4hi((const __builtin_aarch64_simd_hi *)__a); @@ -11189,7 +11239,7 @@ FUNK poly16x4x3_t vld1_p16_x3(const poly16_t *__a) { return __i; } -FUNK poly64x1x3_t vld1_p64_x3(const poly64_t *__a) { +__funline poly64x1x3_t vld1_p64_x3(const poly64_t *__a) { poly64x1x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3di((const __builtin_aarch64_simd_di *)__a); @@ -11200,7 +11250,7 @@ FUNK poly64x1x3_t vld1_p64_x3(const poly64_t *__a) { return __i; } -FUNK uint8x16x3_t vld1q_u8_x3(const uint8_t *__a) { +__funline uint8x16x3_t vld1q_u8_x3(const uint8_t *__a) { uint8x16x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v16qi((const __builtin_aarch64_simd_qi *)__a); @@ -11210,7 +11260,7 @@ FUNK uint8x16x3_t vld1q_u8_x3(const uint8_t *__a) { return __i; } -FUNK int8x16x3_t
vld1q_s8_x3(const int8_t *__a) { +__funline int8x16x3_t vld1q_s8_x3(const int8_t *__a) { int8x16x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v16qi((const __builtin_aarch64_simd_qi *)__a); @@ -11220,7 +11270,7 @@ FUNK int8x16x3_t vld1q_s8_x3(const int8_t *__a) { return __i; } -FUNK uint16x8x3_t vld1q_u16_x3(const uint16_t *__a) { +__funline uint16x8x3_t vld1q_u16_x3(const uint16_t *__a) { uint16x8x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v8hi((const __builtin_aarch64_simd_hi *)__a); @@ -11230,7 +11280,7 @@ FUNK uint16x8x3_t vld1q_u16_x3(const uint16_t *__a) { return __i; } -FUNK int16x8x3_t vld1q_s16_x3(const int16_t *__a) { +__funline int16x8x3_t vld1q_s16_x3(const int16_t *__a) { int16x8x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v8hi((const __builtin_aarch64_simd_hi *)__a); @@ -11240,7 +11290,7 @@ FUNK int16x8x3_t vld1q_s16_x3(const int16_t *__a) { return __i; } -FUNK uint32x4x3_t vld1q_u32_x3(const uint32_t *__a) { +__funline uint32x4x3_t vld1q_u32_x3(const uint32_t *__a) { uint32x4x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v4si((const __builtin_aarch64_simd_si *)__a); @@ -11250,7 +11300,7 @@ FUNK uint32x4x3_t vld1q_u32_x3(const uint32_t *__a) { return __i; } -FUNK int32x4x3_t vld1q_s32_x3(const int32_t *__a) { +__funline int32x4x3_t vld1q_s32_x3(const int32_t *__a) { int32x4x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v4si((const __builtin_aarch64_simd_si *)__a); @@ -11260,7 +11310,7 @@ FUNK int32x4x3_t vld1q_s32_x3(const int32_t *__a) { return __i; } -FUNK uint64x2x3_t vld1q_u64_x3(const uint64_t *__a) { +__funline uint64x2x3_t vld1q_u64_x3(const uint64_t *__a) { uint64x2x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v2di((const __builtin_aarch64_simd_di *)__a); @@ -11270,7 +11320,7 @@ FUNK uint64x2x3_t vld1q_u64_x3(const uint64_t *__a) { return __i; } -FUNK int64x2x3_t vld1q_s64_x3(const int64_t *__a) { +__funline int64x2x3_t vld1q_s64_x3(const int64_t *__a) { int64x2x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v2di((const __builtin_aarch64_simd_di *)__a); @@ -11280,7 +11330,7 @@ FUNK int64x2x3_t vld1q_s64_x3(const int64_t *__a) { return __i; } -FUNK float16x8x3_t vld1q_f16_x3(const float16_t *__a) { +__funline float16x8x3_t vld1q_f16_x3(const float16_t *__a) { float16x8x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v8hf((const __builtin_aarch64_simd_hf *)__a); @@ -11290,7 +11340,7 @@ FUNK float16x8x3_t vld1q_f16_x3(const float16_t *__a) { return __i; } -FUNK float32x4x3_t vld1q_f32_x3(const float32_t *__a) { +__funline float32x4x3_t vld1q_f32_x3(const float32_t *__a) { float32x4x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v4sf((const __builtin_aarch64_simd_sf *)__a); @@ -11300,7 +11350,7 @@ FUNK float32x4x3_t vld1q_f32_x3(const float32_t *__a) { return __i; } -FUNK float64x2x3_t vld1q_f64_x3(const float64_t *__a) { +__funline float64x2x3_t vld1q_f64_x3(const float64_t *__a) { float64x2x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v2df((const __builtin_aarch64_simd_df *)__a); @@ -11310,7 +11360,7 @@ FUNK float64x2x3_t vld1q_f64_x3(const float64_t *__a) { return __i; } -FUNK poly8x16x3_t vld1q_p8_x3(const poly8_t *__a) { +__funline poly8x16x3_t vld1q_p8_x3(const poly8_t *__a) { poly8x16x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v16qi((const __builtin_aarch64_simd_qi *)__a); @@ -11320,7 +11370,7 @@ FUNK 
poly8x16x3_t vld1q_p8_x3(const poly8_t *__a) { return __i; } -FUNK poly16x8x3_t vld1q_p16_x3(const poly16_t *__a) { +__funline poly16x8x3_t vld1q_p16_x3(const poly16_t *__a) { poly16x8x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v8hi((const __builtin_aarch64_simd_hi *)__a); @@ -11330,7 +11380,7 @@ FUNK poly16x8x3_t vld1q_p16_x3(const poly16_t *__a) { return __i; } -FUNK poly64x2x3_t vld1q_p64_x3(const poly64_t *__a) { +__funline poly64x2x3_t vld1q_p64_x3(const poly64_t *__a) { poly64x2x3_t __i; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld1x3v2di((const __builtin_aarch64_simd_di *)__a); @@ -11340,55 +11390,55 @@ FUNK poly64x2x3_t vld1q_p64_x3(const poly64_t *__a) { return __i; } -FUNK float16x8_t vld1q_f16(const float16_t *__a) { +__funline float16x8_t vld1q_f16(const float16_t *__a) { return __builtin_aarch64_ld1v8hf(__a); } -FUNK float32x4_t vld1q_f32(const float32_t *a) { +__funline float32x4_t vld1q_f32(const float32_t *a) { return __builtin_aarch64_ld1v4sf((const __builtin_aarch64_simd_sf *)a); } -FUNK float64x2_t vld1q_f64(const float64_t *a) { +__funline float64x2_t vld1q_f64(const float64_t *a) { return __builtin_aarch64_ld1v2df((const __builtin_aarch64_simd_df *)a); } -FUNK poly8x16_t vld1q_p8(const poly8_t *a) { +__funline poly8x16_t vld1q_p8(const poly8_t *a) { return (poly8x16_t)__builtin_aarch64_ld1v16qi( (const __builtin_aarch64_simd_qi *)a); } -FUNK poly16x8_t vld1q_p16(const poly16_t *a) { +__funline poly16x8_t vld1q_p16(const poly16_t *a) { return (poly16x8_t)__builtin_aarch64_ld1v8hi( (const __builtin_aarch64_simd_hi *)a); } -FUNK poly64x2_t vld1q_p64(const poly64_t *a) { +__funline poly64x2_t vld1q_p64(const poly64_t *a) { return (poly64x2_t)__builtin_aarch64_ld1v2di( (const __builtin_aarch64_simd_di *)a); } -FUNK int8x16_t vld1q_s8(const int8_t *a) { +__funline int8x16_t vld1q_s8(const int8_t *a) { return __builtin_aarch64_ld1v16qi((const __builtin_aarch64_simd_qi *)a); } -FUNK int16x8_t vld1q_s16(const int16_t *a) { +__funline int16x8_t vld1q_s16(const int16_t *a) { return __builtin_aarch64_ld1v8hi((const __builtin_aarch64_simd_hi *)a); } -FUNK int32x4_t vld1q_s32(const int32_t *a) { +__funline int32x4_t vld1q_s32(const int32_t *a) { return __builtin_aarch64_ld1v4si((const __builtin_aarch64_simd_si *)a); } -FUNK int64x2_t vld1q_s64(const int64_t *a) { +__funline int64x2_t vld1q_s64(const int64_t *a) { return __builtin_aarch64_ld1v2di((const __builtin_aarch64_simd_di *)a); } -FUNK uint8x16_t vld1q_u8(const uint8_t *a) { +__funline uint8x16_t vld1q_u8(const uint8_t *a) { return (uint8x16_t)__builtin_aarch64_ld1v16qi( (const __builtin_aarch64_simd_qi *)a); } -FUNK uint8x8x2_t vld1_u8_x2(const uint8_t *__a) { +__funline uint8x8x2_t vld1_u8_x2(const uint8_t *__a) { uint8x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v8qi((const __builtin_aarch64_simd_qi *)__a); @@ -11397,7 +11447,7 @@ FUNK uint8x8x2_t vld1_u8_x2(const uint8_t *__a) { return ret; } -FUNK int8x8x2_t vld1_s8_x2(const int8_t *__a) { +__funline int8x8x2_t vld1_s8_x2(const int8_t *__a) { int8x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v8qi((const __builtin_aarch64_simd_qi *)__a); @@ -11406,7 +11456,7 @@ FUNK int8x8x2_t vld1_s8_x2(const int8_t *__a) { return ret; } -FUNK uint16x4x2_t vld1_u16_x2(const uint16_t *__a) { +__funline uint16x4x2_t vld1_u16_x2(const uint16_t *__a) { uint16x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v4hi((const __builtin_aarch64_simd_hi *)__a); @@ -11415,7 +11465,7 @@ FUNK 
uint16x4x2_t vld1_u16_x2(const uint16_t *__a) { return ret; } -FUNK int16x4x2_t vld1_s16_x2(const int16_t *__a) { +__funline int16x4x2_t vld1_s16_x2(const int16_t *__a) { int16x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v4hi((const __builtin_aarch64_simd_hi *)__a); @@ -11424,7 +11474,7 @@ FUNK int16x4x2_t vld1_s16_x2(const int16_t *__a) { return ret; } -FUNK uint32x2x2_t vld1_u32_x2(const uint32_t *__a) { +__funline uint32x2x2_t vld1_u32_x2(const uint32_t *__a) { uint32x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v2si((const __builtin_aarch64_simd_si *)__a); @@ -11433,7 +11483,7 @@ FUNK uint32x2x2_t vld1_u32_x2(const uint32_t *__a) { return ret; } -FUNK int32x2x2_t vld1_s32_x2(const int32_t *__a) { +__funline int32x2x2_t vld1_s32_x2(const int32_t *__a) { int32x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v2si((const __builtin_aarch64_simd_si *)__a); @@ -11442,7 +11492,7 @@ FUNK int32x2x2_t vld1_s32_x2(const int32_t *__a) { return ret; } -FUNK uint64x1x2_t vld1_u64_x2(const uint64_t *__a) { +__funline uint64x1x2_t vld1_u64_x2(const uint64_t *__a) { uint64x1x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2di((const __builtin_aarch64_simd_di *)__a); @@ -11451,7 +11501,7 @@ FUNK uint64x1x2_t vld1_u64_x2(const uint64_t *__a) { return ret; } -FUNK int64x1x2_t vld1_s64_x2(const int64_t *__a) { +__funline int64x1x2_t vld1_s64_x2(const int64_t *__a) { int64x1x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2di((const __builtin_aarch64_simd_di *)__a); @@ -11460,7 +11510,7 @@ FUNK int64x1x2_t vld1_s64_x2(const int64_t *__a) { return ret; } -FUNK float16x4x2_t vld1_f16_x2(const float16_t *__a) { +__funline float16x4x2_t vld1_f16_x2(const float16_t *__a) { float16x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v4hf((const __builtin_aarch64_simd_hf *)__a); @@ -11469,7 +11519,7 @@ FUNK float16x4x2_t vld1_f16_x2(const float16_t *__a) { return ret; } -FUNK float32x2x2_t vld1_f32_x2(const float32_t *__a) { +__funline float32x2x2_t vld1_f32_x2(const float32_t *__a) { float32x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v2sf((const __builtin_aarch64_simd_sf *)__a); @@ -11478,7 +11528,7 @@ FUNK float32x2x2_t vld1_f32_x2(const float32_t *__a) { return ret; } -FUNK float64x1x2_t vld1_f64_x2(const float64_t *__a) { +__funline float64x1x2_t vld1_f64_x2(const float64_t *__a) { float64x1x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2df((const __builtin_aarch64_simd_df *)__a); @@ -11487,7 +11537,7 @@ FUNK float64x1x2_t vld1_f64_x2(const float64_t *__a) { return ret; } -FUNK poly8x8x2_t vld1_p8_x2(const poly8_t *__a) { +__funline poly8x8x2_t vld1_p8_x2(const poly8_t *__a) { poly8x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v8qi((const __builtin_aarch64_simd_qi *)__a); @@ -11496,7 +11546,7 @@ FUNK poly8x8x2_t vld1_p8_x2(const poly8_t *__a) { return ret; } -FUNK poly16x4x2_t vld1_p16_x2(const poly16_t *__a) { +__funline poly16x4x2_t vld1_p16_x2(const poly16_t *__a) { poly16x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v4hi((const __builtin_aarch64_simd_hi *)__a); @@ -11505,7 +11555,7 @@ FUNK poly16x4x2_t vld1_p16_x2(const poly16_t *__a) { return ret; } -FUNK poly64x1x2_t vld1_p64_x2(const poly64_t *__a) { +__funline poly64x1x2_t vld1_p64_x2(const poly64_t *__a) { poly64x1x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2di((const __builtin_aarch64_simd_di 
*)__a); @@ -11514,7 +11564,7 @@ FUNK poly64x1x2_t vld1_p64_x2(const poly64_t *__a) { return ret; } -FUNK uint8x16x2_t vld1q_u8_x2(const uint8_t *__a) { +__funline uint8x16x2_t vld1q_u8_x2(const uint8_t *__a) { uint8x16x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v16qi((const __builtin_aarch64_simd_qi *)__a); @@ -11523,7 +11573,7 @@ FUNK uint8x16x2_t vld1q_u8_x2(const uint8_t *__a) { return ret; } -FUNK int8x16x2_t vld1q_s8_x2(const int8_t *__a) { +__funline int8x16x2_t vld1q_s8_x2(const int8_t *__a) { int8x16x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v16qi((const __builtin_aarch64_simd_qi *)__a); @@ -11532,7 +11582,7 @@ FUNK int8x16x2_t vld1q_s8_x2(const int8_t *__a) { return ret; } -FUNK uint16x8x2_t vld1q_u16_x2(const uint16_t *__a) { +__funline uint16x8x2_t vld1q_u16_x2(const uint16_t *__a) { uint16x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v8hi((const __builtin_aarch64_simd_hi *)__a); @@ -11541,7 +11591,7 @@ FUNK uint16x8x2_t vld1q_u16_x2(const uint16_t *__a) { return ret; } -FUNK int16x8x2_t vld1q_s16_x2(const int16_t *__a) { +__funline int16x8x2_t vld1q_s16_x2(const int16_t *__a) { int16x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v8hi((const __builtin_aarch64_simd_hi *)__a); @@ -11550,7 +11600,7 @@ FUNK int16x8x2_t vld1q_s16_x2(const int16_t *__a) { return ret; } -FUNK uint32x4x2_t vld1q_u32_x2(const uint32_t *__a) { +__funline uint32x4x2_t vld1q_u32_x2(const uint32_t *__a) { uint32x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v4si((const __builtin_aarch64_simd_si *)__a); @@ -11559,7 +11609,7 @@ FUNK uint32x4x2_t vld1q_u32_x2(const uint32_t *__a) { return ret; } -FUNK int32x4x2_t vld1q_s32_x2(const int32_t *__a) { +__funline int32x4x2_t vld1q_s32_x2(const int32_t *__a) { int32x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v4si((const __builtin_aarch64_simd_si *)__a); @@ -11568,7 +11618,7 @@ FUNK int32x4x2_t vld1q_s32_x2(const int32_t *__a) { return ret; } -FUNK uint64x2x2_t vld1q_u64_x2(const uint64_t *__a) { +__funline uint64x2x2_t vld1q_u64_x2(const uint64_t *__a) { uint64x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v2di((const __builtin_aarch64_simd_di *)__a); @@ -11577,7 +11627,7 @@ FUNK uint64x2x2_t vld1q_u64_x2(const uint64_t *__a) { return ret; } -FUNK int64x2x2_t vld1q_s64_x2(const int64_t *__a) { +__funline int64x2x2_t vld1q_s64_x2(const int64_t *__a) { int64x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v2di((const __builtin_aarch64_simd_di *)__a); @@ -11586,7 +11636,7 @@ FUNK int64x2x2_t vld1q_s64_x2(const int64_t *__a) { return ret; } -FUNK float16x8x2_t vld1q_f16_x2(const float16_t *__a) { +__funline float16x8x2_t vld1q_f16_x2(const float16_t *__a) { float16x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v8hf((const __builtin_aarch64_simd_hf *)__a); @@ -11595,7 +11645,7 @@ FUNK float16x8x2_t vld1q_f16_x2(const float16_t *__a) { return ret; } -FUNK float32x4x2_t vld1q_f32_x2(const float32_t *__a) { +__funline float32x4x2_t vld1q_f32_x2(const float32_t *__a) { float32x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v4sf((const __builtin_aarch64_simd_sf *)__a); @@ -11604,7 +11654,7 @@ FUNK float32x4x2_t vld1q_f32_x2(const float32_t *__a) { return ret; } -FUNK float64x2x2_t vld1q_f64_x2(const float64_t *__a) { +__funline float64x2x2_t vld1q_f64_x2(const float64_t *__a) { float64x2x2_t ret; __builtin_aarch64_simd_oi __o; 
__o = __builtin_aarch64_ld1x2v2df((const __builtin_aarch64_simd_df *)__a); @@ -11613,7 +11663,7 @@ FUNK float64x2x2_t vld1q_f64_x2(const float64_t *__a) { return ret; } -FUNK poly8x16x2_t vld1q_p8_x2(const poly8_t *__a) { +__funline poly8x16x2_t vld1q_p8_x2(const poly8_t *__a) { poly8x16x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v16qi((const __builtin_aarch64_simd_qi *)__a); @@ -11622,7 +11672,7 @@ FUNK poly8x16x2_t vld1q_p8_x2(const poly8_t *__a) { return ret; } -FUNK poly16x8x2_t vld1q_p16_x2(const poly16_t *__a) { +__funline poly16x8x2_t vld1q_p16_x2(const poly16_t *__a) { poly16x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v8hi((const __builtin_aarch64_simd_hi *)__a); @@ -11631,7 +11681,7 @@ FUNK poly16x8x2_t vld1q_p16_x2(const poly16_t *__a) { return ret; } -FUNK poly64x2x2_t vld1q_p64_x2(const poly64_t *__a) { +__funline poly64x2x2_t vld1q_p64_x2(const poly64_t *__a) { poly64x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld1x2v2di((const __builtin_aarch64_simd_di *)__a); @@ -11640,274 +11690,274 @@ FUNK poly64x2x2_t vld1q_p64_x2(const poly64_t *__a) { return ret; } -FUNK uint16x8_t vld1q_u16(const uint16_t *a) { +__funline uint16x8_t vld1q_u16(const uint16_t *a) { return (uint16x8_t)__builtin_aarch64_ld1v8hi( (const __builtin_aarch64_simd_hi *)a); } -FUNK uint32x4_t vld1q_u32(const uint32_t *a) { +__funline uint32x4_t vld1q_u32(const uint32_t *a) { return (uint32x4_t)__builtin_aarch64_ld1v4si( (const __builtin_aarch64_simd_si *)a); } -FUNK uint64x2_t vld1q_u64(const uint64_t *a) { +__funline uint64x2_t vld1q_u64(const uint64_t *a) { return (uint64x2_t)__builtin_aarch64_ld1v2di( (const __builtin_aarch64_simd_di *)a); } -FUNK float16x4_t vld1_dup_f16(const float16_t *__a) { +__funline float16x4_t vld1_dup_f16(const float16_t *__a) { return vdup_n_f16(*__a); } -FUNK float32x2_t vld1_dup_f32(const float32_t *__a) { +__funline float32x2_t vld1_dup_f32(const float32_t *__a) { return vdup_n_f32(*__a); } -FUNK float64x1_t vld1_dup_f64(const float64_t *__a) { +__funline float64x1_t vld1_dup_f64(const float64_t *__a) { return vdup_n_f64(*__a); } -FUNK poly8x8_t vld1_dup_p8(const poly8_t *__a) { +__funline poly8x8_t vld1_dup_p8(const poly8_t *__a) { return vdup_n_p8(*__a); } -FUNK poly16x4_t vld1_dup_p16(const poly16_t *__a) { +__funline poly16x4_t vld1_dup_p16(const poly16_t *__a) { return vdup_n_p16(*__a); } -FUNK poly64x1_t vld1_dup_p64(const poly64_t *__a) { +__funline poly64x1_t vld1_dup_p64(const poly64_t *__a) { return vdup_n_p64(*__a); } -FUNK int8x8_t vld1_dup_s8(const int8_t *__a) { +__funline int8x8_t vld1_dup_s8(const int8_t *__a) { return vdup_n_s8(*__a); } -FUNK int16x4_t vld1_dup_s16(const int16_t *__a) { +__funline int16x4_t vld1_dup_s16(const int16_t *__a) { return vdup_n_s16(*__a); } -FUNK int32x2_t vld1_dup_s32(const int32_t *__a) { +__funline int32x2_t vld1_dup_s32(const int32_t *__a) { return vdup_n_s32(*__a); } -FUNK int64x1_t vld1_dup_s64(const int64_t *__a) { +__funline int64x1_t vld1_dup_s64(const int64_t *__a) { return vdup_n_s64(*__a); } -FUNK uint8x8_t vld1_dup_u8(const uint8_t *__a) { +__funline uint8x8_t vld1_dup_u8(const uint8_t *__a) { return vdup_n_u8(*__a); } -FUNK uint16x4_t vld1_dup_u16(const uint16_t *__a) { +__funline uint16x4_t vld1_dup_u16(const uint16_t *__a) { return vdup_n_u16(*__a); } -FUNK uint32x2_t vld1_dup_u32(const uint32_t *__a) { +__funline uint32x2_t vld1_dup_u32(const uint32_t *__a) { return vdup_n_u32(*__a); } -FUNK uint64x1_t vld1_dup_u64(const uint64_t *__a) { 
+__funline uint64x1_t vld1_dup_u64(const uint64_t *__a) { return vdup_n_u64(*__a); } -FUNK float16x8_t vld1q_dup_f16(const float16_t *__a) { +__funline float16x8_t vld1q_dup_f16(const float16_t *__a) { return vdupq_n_f16(*__a); } -FUNK float32x4_t vld1q_dup_f32(const float32_t *__a) { +__funline float32x4_t vld1q_dup_f32(const float32_t *__a) { return vdupq_n_f32(*__a); } -FUNK float64x2_t vld1q_dup_f64(const float64_t *__a) { +__funline float64x2_t vld1q_dup_f64(const float64_t *__a) { return vdupq_n_f64(*__a); } -FUNK poly8x16_t vld1q_dup_p8(const poly8_t *__a) { +__funline poly8x16_t vld1q_dup_p8(const poly8_t *__a) { return vdupq_n_p8(*__a); } -FUNK poly16x8_t vld1q_dup_p16(const poly16_t *__a) { +__funline poly16x8_t vld1q_dup_p16(const poly16_t *__a) { return vdupq_n_p16(*__a); } -FUNK poly64x2_t vld1q_dup_p64(const poly64_t *__a) { +__funline poly64x2_t vld1q_dup_p64(const poly64_t *__a) { return vdupq_n_p64(*__a); } -FUNK int8x16_t vld1q_dup_s8(const int8_t *__a) { +__funline int8x16_t vld1q_dup_s8(const int8_t *__a) { return vdupq_n_s8(*__a); } -FUNK int16x8_t vld1q_dup_s16(const int16_t *__a) { +__funline int16x8_t vld1q_dup_s16(const int16_t *__a) { return vdupq_n_s16(*__a); } -FUNK int32x4_t vld1q_dup_s32(const int32_t *__a) { +__funline int32x4_t vld1q_dup_s32(const int32_t *__a) { return vdupq_n_s32(*__a); } -FUNK int64x2_t vld1q_dup_s64(const int64_t *__a) { +__funline int64x2_t vld1q_dup_s64(const int64_t *__a) { return vdupq_n_s64(*__a); } -FUNK uint8x16_t vld1q_dup_u8(const uint8_t *__a) { +__funline uint8x16_t vld1q_dup_u8(const uint8_t *__a) { return vdupq_n_u8(*__a); } -FUNK uint16x8_t vld1q_dup_u16(const uint16_t *__a) { +__funline uint16x8_t vld1q_dup_u16(const uint16_t *__a) { return vdupq_n_u16(*__a); } -FUNK uint32x4_t vld1q_dup_u32(const uint32_t *__a) { +__funline uint32x4_t vld1q_dup_u32(const uint32_t *__a) { return vdupq_n_u32(*__a); } -FUNK uint64x2_t vld1q_dup_u64(const uint64_t *__a) { +__funline uint64x2_t vld1q_dup_u64(const uint64_t *__a) { return vdupq_n_u64(*__a); } -FUNK float16x4_t vld1_lane_f16(const float16_t *__src, float16x4_t __vec, - const int __lane) { +__funline float16x4_t vld1_lane_f16(const float16_t *__src, float16x4_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK float32x2_t vld1_lane_f32(const float32_t *__src, float32x2_t __vec, - const int __lane) { +__funline float32x2_t vld1_lane_f32(const float32_t *__src, float32x2_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK float64x1_t vld1_lane_f64(const float64_t *__src, float64x1_t __vec, - const int __lane) { +__funline float64x1_t vld1_lane_f64(const float64_t *__src, float64x1_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK poly8x8_t vld1_lane_p8(const poly8_t *__src, poly8x8_t __vec, - const int __lane) { +__funline poly8x8_t vld1_lane_p8(const poly8_t *__src, poly8x8_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK poly16x4_t vld1_lane_p16(const poly16_t *__src, poly16x4_t __vec, - const int __lane) { +__funline poly16x4_t vld1_lane_p16(const poly16_t *__src, poly16x4_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK poly64x1_t vld1_lane_p64(const poly64_t *__src, poly64x1_t __vec, - const int __lane) { +__funline poly64x1_t vld1_lane_p64(const poly64_t *__src, poly64x1_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } 
-FUNK int8x8_t vld1_lane_s8(const int8_t *__src, int8x8_t __vec, - const int __lane) { - return __aarch64_vset_lane_any(*__src, __vec, __lane); -} - -FUNK int16x4_t vld1_lane_s16(const int16_t *__src, int16x4_t __vec, - const int __lane) { - return __aarch64_vset_lane_any(*__src, __vec, __lane); -} - -FUNK int32x2_t vld1_lane_s32(const int32_t *__src, int32x2_t __vec, - const int __lane) { - return __aarch64_vset_lane_any(*__src, __vec, __lane); -} - -FUNK int64x1_t vld1_lane_s64(const int64_t *__src, int64x1_t __vec, - const int __lane) { - return __aarch64_vset_lane_any(*__src, __vec, __lane); -} - -FUNK uint8x8_t vld1_lane_u8(const uint8_t *__src, uint8x8_t __vec, - const int __lane) { - return __aarch64_vset_lane_any(*__src, __vec, __lane); -} - -FUNK uint16x4_t vld1_lane_u16(const uint16_t *__src, uint16x4_t __vec, - const int __lane) { - return __aarch64_vset_lane_any(*__src, __vec, __lane); -} - -FUNK uint32x2_t vld1_lane_u32(const uint32_t *__src, uint32x2_t __vec, - const int __lane) { - return __aarch64_vset_lane_any(*__src, __vec, __lane); -} - -FUNK uint64x1_t vld1_lane_u64(const uint64_t *__src, uint64x1_t __vec, - const int __lane) { - return __aarch64_vset_lane_any(*__src, __vec, __lane); -} - -FUNK float16x8_t vld1q_lane_f16(const float16_t *__src, float16x8_t __vec, +__funline int8x8_t vld1_lane_s8(const int8_t *__src, int8x8_t __vec, const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK float32x4_t vld1q_lane_f32(const float32_t *__src, float32x4_t __vec, - const int __lane) { +__funline int16x4_t vld1_lane_s16(const int16_t *__src, int16x4_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK float64x2_t vld1q_lane_f64(const float64_t *__src, float64x2_t __vec, - const int __lane) { +__funline int32x2_t vld1_lane_s32(const int32_t *__src, int32x2_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK poly8x16_t vld1q_lane_p8(const poly8_t *__src, poly8x16_t __vec, - const int __lane) { +__funline int64x1_t vld1_lane_s64(const int64_t *__src, int64x1_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK poly16x8_t vld1q_lane_p16(const poly16_t *__src, poly16x8_t __vec, - const int __lane) { +__funline uint8x8_t vld1_lane_u8(const uint8_t *__src, uint8x8_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK poly64x2_t vld1q_lane_p64(const poly64_t *__src, poly64x2_t __vec, - const int __lane) { +__funline uint16x4_t vld1_lane_u16(const uint16_t *__src, uint16x4_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK int8x16_t vld1q_lane_s8(const int8_t *__src, int8x16_t __vec, - const int __lane) { +__funline uint32x2_t vld1_lane_u32(const uint32_t *__src, uint32x2_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK int16x8_t vld1q_lane_s16(const int16_t *__src, int16x8_t __vec, - const int __lane) { +__funline uint64x1_t vld1_lane_u64(const uint64_t *__src, uint64x1_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK int32x4_t vld1q_lane_s32(const int32_t *__src, int32x4_t __vec, - const int __lane) { +__funline float16x8_t vld1q_lane_f16(const float16_t *__src, float16x8_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK int64x2_t vld1q_lane_s64(const int64_t *__src, int64x2_t __vec, - const int __lane) { +__funline 
float32x4_t vld1q_lane_f32(const float32_t *__src, float32x4_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK uint8x16_t vld1q_lane_u8(const uint8_t *__src, uint8x16_t __vec, - const int __lane) { +__funline float64x2_t vld1q_lane_f64(const float64_t *__src, float64x2_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK uint16x8_t vld1q_lane_u16(const uint16_t *__src, uint16x8_t __vec, - const int __lane) { +__funline poly8x16_t vld1q_lane_p8(const poly8_t *__src, poly8x16_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK uint32x4_t vld1q_lane_u32(const uint32_t *__src, uint32x4_t __vec, - const int __lane) { +__funline poly16x8_t vld1q_lane_p16(const poly16_t *__src, poly16x8_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK uint64x2_t vld1q_lane_u64(const uint64_t *__src, uint64x2_t __vec, - const int __lane) { +__funline poly64x2_t vld1q_lane_p64(const poly64_t *__src, poly64x2_t __vec, + const int __lane) { return __aarch64_vset_lane_any(*__src, __vec, __lane); } -FUNK int64x1x2_t vld2_s64(const int64_t *__a) { +__funline int8x16_t vld1q_lane_s8(const int8_t *__src, int8x16_t __vec, + const int __lane) { + return __aarch64_vset_lane_any(*__src, __vec, __lane); +} + +__funline int16x8_t vld1q_lane_s16(const int16_t *__src, int16x8_t __vec, + const int __lane) { + return __aarch64_vset_lane_any(*__src, __vec, __lane); +} + +__funline int32x4_t vld1q_lane_s32(const int32_t *__src, int32x4_t __vec, + const int __lane) { + return __aarch64_vset_lane_any(*__src, __vec, __lane); +} + +__funline int64x2_t vld1q_lane_s64(const int64_t *__src, int64x2_t __vec, + const int __lane) { + return __aarch64_vset_lane_any(*__src, __vec, __lane); +} + +__funline uint8x16_t vld1q_lane_u8(const uint8_t *__src, uint8x16_t __vec, + const int __lane) { + return __aarch64_vset_lane_any(*__src, __vec, __lane); +} + +__funline uint16x8_t vld1q_lane_u16(const uint16_t *__src, uint16x8_t __vec, + const int __lane) { + return __aarch64_vset_lane_any(*__src, __vec, __lane); +} + +__funline uint32x4_t vld1q_lane_u32(const uint32_t *__src, uint32x4_t __vec, + const int __lane) { + return __aarch64_vset_lane_any(*__src, __vec, __lane); +} + +__funline uint64x2_t vld1q_lane_u64(const uint64_t *__src, uint64x2_t __vec, + const int __lane) { + return __aarch64_vset_lane_any(*__src, __vec, __lane); +} + +__funline int64x1x2_t vld2_s64(const int64_t *__a) { int64x1x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2di((const __builtin_aarch64_simd_di *)__a); @@ -11916,7 +11966,7 @@ FUNK int64x1x2_t vld2_s64(const int64_t *__a) { return ret; } -FUNK uint64x1x2_t vld2_u64(const uint64_t *__a) { +__funline uint64x1x2_t vld2_u64(const uint64_t *__a) { uint64x1x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2di((const __builtin_aarch64_simd_di *)__a); @@ -11925,7 +11975,7 @@ FUNK uint64x1x2_t vld2_u64(const uint64_t *__a) { return ret; } -FUNK float64x1x2_t vld2_f64(const float64_t *__a) { +__funline float64x1x2_t vld2_f64(const float64_t *__a) { float64x1x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2df((const __builtin_aarch64_simd_df *)__a); @@ -11934,7 +11984,7 @@ FUNK float64x1x2_t vld2_f64(const float64_t *__a) { return ret; } -FUNK int8x8x2_t vld2_s8(const int8_t *__a) { +__funline int8x8x2_t vld2_s8(const int8_t *__a) { int8x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = 
__builtin_aarch64_ld2v8qi((const __builtin_aarch64_simd_qi *)__a); @@ -11943,7 +11993,7 @@ FUNK int8x8x2_t vld2_s8(const int8_t *__a) { return ret; } -FUNK poly8x8x2_t vld2_p8(const poly8_t *__a) { +__funline poly8x8x2_t vld2_p8(const poly8_t *__a) { poly8x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v8qi((const __builtin_aarch64_simd_qi *)__a); @@ -11952,7 +12002,7 @@ FUNK poly8x8x2_t vld2_p8(const poly8_t *__a) { return ret; } -FUNK poly64x1x2_t vld2_p64(const poly64_t *__a) { +__funline poly64x1x2_t vld2_p64(const poly64_t *__a) { poly64x1x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2di((const __builtin_aarch64_simd_di *)__a); @@ -11961,7 +12011,7 @@ FUNK poly64x1x2_t vld2_p64(const poly64_t *__a) { return ret; } -FUNK int16x4x2_t vld2_s16(const int16_t *__a) { +__funline int16x4x2_t vld2_s16(const int16_t *__a) { int16x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v4hi((const __builtin_aarch64_simd_hi *)__a); @@ -11970,7 +12020,7 @@ FUNK int16x4x2_t vld2_s16(const int16_t *__a) { return ret; } -FUNK poly16x4x2_t vld2_p16(const poly16_t *__a) { +__funline poly16x4x2_t vld2_p16(const poly16_t *__a) { poly16x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v4hi((const __builtin_aarch64_simd_hi *)__a); @@ -11979,7 +12029,7 @@ FUNK poly16x4x2_t vld2_p16(const poly16_t *__a) { return ret; } -FUNK int32x2x2_t vld2_s32(const int32_t *__a) { +__funline int32x2x2_t vld2_s32(const int32_t *__a) { int32x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v2si((const __builtin_aarch64_simd_si *)__a); @@ -11988,7 +12038,7 @@ FUNK int32x2x2_t vld2_s32(const int32_t *__a) { return ret; } -FUNK uint8x8x2_t vld2_u8(const uint8_t *__a) { +__funline uint8x8x2_t vld2_u8(const uint8_t *__a) { uint8x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v8qi((const __builtin_aarch64_simd_qi *)__a); @@ -11997,7 +12047,7 @@ FUNK uint8x8x2_t vld2_u8(const uint8_t *__a) { return ret; } -FUNK uint16x4x2_t vld2_u16(const uint16_t *__a) { +__funline uint16x4x2_t vld2_u16(const uint16_t *__a) { uint16x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v4hi((const __builtin_aarch64_simd_hi *)__a); @@ -12006,7 +12056,7 @@ FUNK uint16x4x2_t vld2_u16(const uint16_t *__a) { return ret; } -FUNK uint32x2x2_t vld2_u32(const uint32_t *__a) { +__funline uint32x2x2_t vld2_u32(const uint32_t *__a) { uint32x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v2si((const __builtin_aarch64_simd_si *)__a); @@ -12015,7 +12065,7 @@ FUNK uint32x2x2_t vld2_u32(const uint32_t *__a) { return ret; } -FUNK float16x4x2_t vld2_f16(const float16_t *__a) { +__funline float16x4x2_t vld2_f16(const float16_t *__a) { float16x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v4hf(__a); @@ -12024,7 +12074,7 @@ FUNK float16x4x2_t vld2_f16(const float16_t *__a) { return ret; } -FUNK float32x2x2_t vld2_f32(const float32_t *__a) { +__funline float32x2x2_t vld2_f32(const float32_t *__a) { float32x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v2sf((const __builtin_aarch64_simd_sf *)__a); @@ -12033,7 +12083,7 @@ FUNK float32x2x2_t vld2_f32(const float32_t *__a) { return ret; } -FUNK int8x16x2_t vld2q_s8(const int8_t *__a) { +__funline int8x16x2_t vld2q_s8(const int8_t *__a) { int8x16x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v16qi((const __builtin_aarch64_simd_qi *)__a); @@ -12042,7 +12092,7 @@ FUNK int8x16x2_t vld2q_s8(const int8_t *__a) 
{ return ret; } -FUNK poly8x16x2_t vld2q_p8(const poly8_t *__a) { +__funline poly8x16x2_t vld2q_p8(const poly8_t *__a) { poly8x16x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v16qi((const __builtin_aarch64_simd_qi *)__a); @@ -12051,7 +12101,7 @@ FUNK poly8x16x2_t vld2q_p8(const poly8_t *__a) { return ret; } -FUNK int16x8x2_t vld2q_s16(const int16_t *__a) { +__funline int16x8x2_t vld2q_s16(const int16_t *__a) { int16x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v8hi((const __builtin_aarch64_simd_hi *)__a); @@ -12060,7 +12110,7 @@ FUNK int16x8x2_t vld2q_s16(const int16_t *__a) { return ret; } -FUNK poly16x8x2_t vld2q_p16(const poly16_t *__a) { +__funline poly16x8x2_t vld2q_p16(const poly16_t *__a) { poly16x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v8hi((const __builtin_aarch64_simd_hi *)__a); @@ -12069,7 +12119,7 @@ FUNK poly16x8x2_t vld2q_p16(const poly16_t *__a) { return ret; } -FUNK poly64x2x2_t vld2q_p64(const poly64_t *__a) { +__funline poly64x2x2_t vld2q_p64(const poly64_t *__a) { poly64x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v2di((const __builtin_aarch64_simd_di *)__a); @@ -12078,7 +12128,7 @@ FUNK poly64x2x2_t vld2q_p64(const poly64_t *__a) { return ret; } -FUNK int32x4x2_t vld2q_s32(const int32_t *__a) { +__funline int32x4x2_t vld2q_s32(const int32_t *__a) { int32x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v4si((const __builtin_aarch64_simd_si *)__a); @@ -12087,7 +12137,7 @@ FUNK int32x4x2_t vld2q_s32(const int32_t *__a) { return ret; } -FUNK int64x2x2_t vld2q_s64(const int64_t *__a) { +__funline int64x2x2_t vld2q_s64(const int64_t *__a) { int64x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v2di((const __builtin_aarch64_simd_di *)__a); @@ -12096,7 +12146,7 @@ FUNK int64x2x2_t vld2q_s64(const int64_t *__a) { return ret; } -FUNK uint8x16x2_t vld2q_u8(const uint8_t *__a) { +__funline uint8x16x2_t vld2q_u8(const uint8_t *__a) { uint8x16x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v16qi((const __builtin_aarch64_simd_qi *)__a); @@ -12105,7 +12155,7 @@ FUNK uint8x16x2_t vld2q_u8(const uint8_t *__a) { return ret; } -FUNK uint16x8x2_t vld2q_u16(const uint16_t *__a) { +__funline uint16x8x2_t vld2q_u16(const uint16_t *__a) { uint16x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v8hi((const __builtin_aarch64_simd_hi *)__a); @@ -12114,7 +12164,7 @@ FUNK uint16x8x2_t vld2q_u16(const uint16_t *__a) { return ret; } -FUNK uint32x4x2_t vld2q_u32(const uint32_t *__a) { +__funline uint32x4x2_t vld2q_u32(const uint32_t *__a) { uint32x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v4si((const __builtin_aarch64_simd_si *)__a); @@ -12123,7 +12173,7 @@ FUNK uint32x4x2_t vld2q_u32(const uint32_t *__a) { return ret; } -FUNK uint64x2x2_t vld2q_u64(const uint64_t *__a) { +__funline uint64x2x2_t vld2q_u64(const uint64_t *__a) { uint64x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v2di((const __builtin_aarch64_simd_di *)__a); @@ -12132,7 +12182,7 @@ FUNK uint64x2x2_t vld2q_u64(const uint64_t *__a) { return ret; } -FUNK float16x8x2_t vld2q_f16(const float16_t *__a) { +__funline float16x8x2_t vld2q_f16(const float16_t *__a) { float16x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v8hf(__a); @@ -12141,7 +12191,7 @@ FUNK float16x8x2_t vld2q_f16(const float16_t *__a) { return ret; } -FUNK float32x4x2_t vld2q_f32(const float32_t *__a) { +__funline 
float32x4x2_t vld2q_f32(const float32_t *__a) { float32x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v4sf((const __builtin_aarch64_simd_sf *)__a); @@ -12150,7 +12200,7 @@ FUNK float32x4x2_t vld2q_f32(const float32_t *__a) { return ret; } -FUNK float64x2x2_t vld2q_f64(const float64_t *__a) { +__funline float64x2x2_t vld2q_f64(const float64_t *__a) { float64x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v2df((const __builtin_aarch64_simd_df *)__a); @@ -12159,7 +12209,7 @@ FUNK float64x2x2_t vld2q_f64(const float64_t *__a) { return ret; } -FUNK int64x1x3_t vld3_s64(const int64_t *__a) { +__funline int64x1x3_t vld3_s64(const int64_t *__a) { int64x1x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3di((const __builtin_aarch64_simd_di *)__a); @@ -12169,7 +12219,7 @@ FUNK int64x1x3_t vld3_s64(const int64_t *__a) { return ret; } -FUNK uint64x1x3_t vld3_u64(const uint64_t *__a) { +__funline uint64x1x3_t vld3_u64(const uint64_t *__a) { uint64x1x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3di((const __builtin_aarch64_simd_di *)__a); @@ -12179,7 +12229,7 @@ FUNK uint64x1x3_t vld3_u64(const uint64_t *__a) { return ret; } -FUNK float64x1x3_t vld3_f64(const float64_t *__a) { +__funline float64x1x3_t vld3_f64(const float64_t *__a) { float64x1x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3df((const __builtin_aarch64_simd_df *)__a); @@ -12189,7 +12239,7 @@ FUNK float64x1x3_t vld3_f64(const float64_t *__a) { return ret; } -FUNK int8x8x3_t vld3_s8(const int8_t *__a) { +__funline int8x8x3_t vld3_s8(const int8_t *__a) { int8x8x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v8qi((const __builtin_aarch64_simd_qi *)__a); @@ -12199,7 +12249,7 @@ FUNK int8x8x3_t vld3_s8(const int8_t *__a) { return ret; } -FUNK poly8x8x3_t vld3_p8(const poly8_t *__a) { +__funline poly8x8x3_t vld3_p8(const poly8_t *__a) { poly8x8x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v8qi((const __builtin_aarch64_simd_qi *)__a); @@ -12209,7 +12259,7 @@ FUNK poly8x8x3_t vld3_p8(const poly8_t *__a) { return ret; } -FUNK int16x4x3_t vld3_s16(const int16_t *__a) { +__funline int16x4x3_t vld3_s16(const int16_t *__a) { int16x4x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v4hi((const __builtin_aarch64_simd_hi *)__a); @@ -12219,7 +12269,7 @@ FUNK int16x4x3_t vld3_s16(const int16_t *__a) { return ret; } -FUNK poly16x4x3_t vld3_p16(const poly16_t *__a) { +__funline poly16x4x3_t vld3_p16(const poly16_t *__a) { poly16x4x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v4hi((const __builtin_aarch64_simd_hi *)__a); @@ -12229,7 +12279,7 @@ FUNK poly16x4x3_t vld3_p16(const poly16_t *__a) { return ret; } -FUNK int32x2x3_t vld3_s32(const int32_t *__a) { +__funline int32x2x3_t vld3_s32(const int32_t *__a) { int32x2x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v2si((const __builtin_aarch64_simd_si *)__a); @@ -12239,7 +12289,7 @@ FUNK int32x2x3_t vld3_s32(const int32_t *__a) { return ret; } -FUNK uint8x8x3_t vld3_u8(const uint8_t *__a) { +__funline uint8x8x3_t vld3_u8(const uint8_t *__a) { uint8x8x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v8qi((const __builtin_aarch64_simd_qi *)__a); @@ -12249,7 +12299,7 @@ FUNK uint8x8x3_t vld3_u8(const uint8_t *__a) { return ret; } -FUNK uint16x4x3_t vld3_u16(const uint16_t *__a) { +__funline uint16x4x3_t vld3_u16(const uint16_t *__a) { uint16x4x3_t ret; __builtin_aarch64_simd_ci __o; __o = 
__builtin_aarch64_ld3v4hi((const __builtin_aarch64_simd_hi *)__a); @@ -12259,7 +12309,7 @@ FUNK uint16x4x3_t vld3_u16(const uint16_t *__a) { return ret; } -FUNK uint32x2x3_t vld3_u32(const uint32_t *__a) { +__funline uint32x2x3_t vld3_u32(const uint32_t *__a) { uint32x2x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v2si((const __builtin_aarch64_simd_si *)__a); @@ -12269,7 +12319,7 @@ FUNK uint32x2x3_t vld3_u32(const uint32_t *__a) { return ret; } -FUNK float16x4x3_t vld3_f16(const float16_t *__a) { +__funline float16x4x3_t vld3_f16(const float16_t *__a) { float16x4x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v4hf(__a); @@ -12279,7 +12329,7 @@ FUNK float16x4x3_t vld3_f16(const float16_t *__a) { return ret; } -FUNK float32x2x3_t vld3_f32(const float32_t *__a) { +__funline float32x2x3_t vld3_f32(const float32_t *__a) { float32x2x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v2sf((const __builtin_aarch64_simd_sf *)__a); @@ -12289,7 +12339,7 @@ FUNK float32x2x3_t vld3_f32(const float32_t *__a) { return ret; } -FUNK poly64x1x3_t vld3_p64(const poly64_t *__a) { +__funline poly64x1x3_t vld3_p64(const poly64_t *__a) { poly64x1x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3di((const __builtin_aarch64_simd_di *)__a); @@ -12299,7 +12349,7 @@ FUNK poly64x1x3_t vld3_p64(const poly64_t *__a) { return ret; } -FUNK int8x16x3_t vld3q_s8(const int8_t *__a) { +__funline int8x16x3_t vld3q_s8(const int8_t *__a) { int8x16x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v16qi((const __builtin_aarch64_simd_qi *)__a); @@ -12309,7 +12359,7 @@ FUNK int8x16x3_t vld3q_s8(const int8_t *__a) { return ret; } -FUNK poly8x16x3_t vld3q_p8(const poly8_t *__a) { +__funline poly8x16x3_t vld3q_p8(const poly8_t *__a) { poly8x16x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v16qi((const __builtin_aarch64_simd_qi *)__a); @@ -12319,7 +12369,7 @@ FUNK poly8x16x3_t vld3q_p8(const poly8_t *__a) { return ret; } -FUNK int16x8x3_t vld3q_s16(const int16_t *__a) { +__funline int16x8x3_t vld3q_s16(const int16_t *__a) { int16x8x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v8hi((const __builtin_aarch64_simd_hi *)__a); @@ -12329,7 +12379,7 @@ FUNK int16x8x3_t vld3q_s16(const int16_t *__a) { return ret; } -FUNK poly16x8x3_t vld3q_p16(const poly16_t *__a) { +__funline poly16x8x3_t vld3q_p16(const poly16_t *__a) { poly16x8x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v8hi((const __builtin_aarch64_simd_hi *)__a); @@ -12339,7 +12389,7 @@ FUNK poly16x8x3_t vld3q_p16(const poly16_t *__a) { return ret; } -FUNK int32x4x3_t vld3q_s32(const int32_t *__a) { +__funline int32x4x3_t vld3q_s32(const int32_t *__a) { int32x4x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v4si((const __builtin_aarch64_simd_si *)__a); @@ -12349,7 +12399,7 @@ FUNK int32x4x3_t vld3q_s32(const int32_t *__a) { return ret; } -FUNK int64x2x3_t vld3q_s64(const int64_t *__a) { +__funline int64x2x3_t vld3q_s64(const int64_t *__a) { int64x2x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v2di((const __builtin_aarch64_simd_di *)__a); @@ -12359,7 +12409,7 @@ FUNK int64x2x3_t vld3q_s64(const int64_t *__a) { return ret; } -FUNK uint8x16x3_t vld3q_u8(const uint8_t *__a) { +__funline uint8x16x3_t vld3q_u8(const uint8_t *__a) { uint8x16x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v16qi((const __builtin_aarch64_simd_qi *)__a); @@ -12369,7 +12419,7 @@ FUNK uint8x16x3_t 
vld3q_u8(const uint8_t *__a) { return ret; } -FUNK uint16x8x3_t vld3q_u16(const uint16_t *__a) { +__funline uint16x8x3_t vld3q_u16(const uint16_t *__a) { uint16x8x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v8hi((const __builtin_aarch64_simd_hi *)__a); @@ -12379,7 +12429,7 @@ FUNK uint16x8x3_t vld3q_u16(const uint16_t *__a) { return ret; } -FUNK uint32x4x3_t vld3q_u32(const uint32_t *__a) { +__funline uint32x4x3_t vld3q_u32(const uint32_t *__a) { uint32x4x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v4si((const __builtin_aarch64_simd_si *)__a); @@ -12389,7 +12439,7 @@ FUNK uint32x4x3_t vld3q_u32(const uint32_t *__a) { return ret; } -FUNK uint64x2x3_t vld3q_u64(const uint64_t *__a) { +__funline uint64x2x3_t vld3q_u64(const uint64_t *__a) { uint64x2x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v2di((const __builtin_aarch64_simd_di *)__a); @@ -12399,7 +12449,7 @@ FUNK uint64x2x3_t vld3q_u64(const uint64_t *__a) { return ret; } -FUNK float16x8x3_t vld3q_f16(const float16_t *__a) { +__funline float16x8x3_t vld3q_f16(const float16_t *__a) { float16x8x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v8hf(__a); @@ -12409,7 +12459,7 @@ FUNK float16x8x3_t vld3q_f16(const float16_t *__a) { return ret; } -FUNK float32x4x3_t vld3q_f32(const float32_t *__a) { +__funline float32x4x3_t vld3q_f32(const float32_t *__a) { float32x4x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v4sf((const __builtin_aarch64_simd_sf *)__a); @@ -12419,7 +12469,7 @@ FUNK float32x4x3_t vld3q_f32(const float32_t *__a) { return ret; } -FUNK float64x2x3_t vld3q_f64(const float64_t *__a) { +__funline float64x2x3_t vld3q_f64(const float64_t *__a) { float64x2x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v2df((const __builtin_aarch64_simd_df *)__a); @@ -12429,7 +12479,7 @@ FUNK float64x2x3_t vld3q_f64(const float64_t *__a) { return ret; } -FUNK poly64x2x3_t vld3q_p64(const poly64_t *__a) { +__funline poly64x2x3_t vld3q_p64(const poly64_t *__a) { poly64x2x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3v2di((const __builtin_aarch64_simd_di *)__a); @@ -12439,7 +12489,7 @@ FUNK poly64x2x3_t vld3q_p64(const poly64_t *__a) { return ret; } -FUNK int64x1x4_t vld4_s64(const int64_t *__a) { +__funline int64x1x4_t vld4_s64(const int64_t *__a) { int64x1x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4di((const __builtin_aarch64_simd_di *)__a); @@ -12450,7 +12500,7 @@ FUNK int64x1x4_t vld4_s64(const int64_t *__a) { return ret; } -FUNK uint64x1x4_t vld4_u64(const uint64_t *__a) { +__funline uint64x1x4_t vld4_u64(const uint64_t *__a) { uint64x1x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4di((const __builtin_aarch64_simd_di *)__a); @@ -12461,7 +12511,7 @@ FUNK uint64x1x4_t vld4_u64(const uint64_t *__a) { return ret; } -FUNK float64x1x4_t vld4_f64(const float64_t *__a) { +__funline float64x1x4_t vld4_f64(const float64_t *__a) { float64x1x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4df((const __builtin_aarch64_simd_df *)__a); @@ -12472,7 +12522,7 @@ FUNK float64x1x4_t vld4_f64(const float64_t *__a) { return ret; } -FUNK int8x8x4_t vld4_s8(const int8_t *__a) { +__funline int8x8x4_t vld4_s8(const int8_t *__a) { int8x8x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v8qi((const __builtin_aarch64_simd_qi *)__a); @@ -12483,7 +12533,7 @@ FUNK int8x8x4_t vld4_s8(const int8_t *__a) { return ret; } -FUNK poly8x8x4_t vld4_p8(const 
poly8_t *__a) { +__funline poly8x8x4_t vld4_p8(const poly8_t *__a) { poly8x8x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v8qi((const __builtin_aarch64_simd_qi *)__a); @@ -12494,7 +12544,7 @@ FUNK poly8x8x4_t vld4_p8(const poly8_t *__a) { return ret; } -FUNK int16x4x4_t vld4_s16(const int16_t *__a) { +__funline int16x4x4_t vld4_s16(const int16_t *__a) { int16x4x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v4hi((const __builtin_aarch64_simd_hi *)__a); @@ -12505,7 +12555,7 @@ FUNK int16x4x4_t vld4_s16(const int16_t *__a) { return ret; } -FUNK poly16x4x4_t vld4_p16(const poly16_t *__a) { +__funline poly16x4x4_t vld4_p16(const poly16_t *__a) { poly16x4x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v4hi((const __builtin_aarch64_simd_hi *)__a); @@ -12516,7 +12566,7 @@ FUNK poly16x4x4_t vld4_p16(const poly16_t *__a) { return ret; } -FUNK int32x2x4_t vld4_s32(const int32_t *__a) { +__funline int32x2x4_t vld4_s32(const int32_t *__a) { int32x2x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v2si((const __builtin_aarch64_simd_si *)__a); @@ -12527,7 +12577,7 @@ FUNK int32x2x4_t vld4_s32(const int32_t *__a) { return ret; } -FUNK uint8x8x4_t vld4_u8(const uint8_t *__a) { +__funline uint8x8x4_t vld4_u8(const uint8_t *__a) { uint8x8x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v8qi((const __builtin_aarch64_simd_qi *)__a); @@ -12538,7 +12588,7 @@ FUNK uint8x8x4_t vld4_u8(const uint8_t *__a) { return ret; } -FUNK uint16x4x4_t vld4_u16(const uint16_t *__a) { +__funline uint16x4x4_t vld4_u16(const uint16_t *__a) { uint16x4x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v4hi((const __builtin_aarch64_simd_hi *)__a); @@ -12549,7 +12599,7 @@ FUNK uint16x4x4_t vld4_u16(const uint16_t *__a) { return ret; } -FUNK uint32x2x4_t vld4_u32(const uint32_t *__a) { +__funline uint32x2x4_t vld4_u32(const uint32_t *__a) { uint32x2x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v2si((const __builtin_aarch64_simd_si *)__a); @@ -12560,7 +12610,7 @@ FUNK uint32x2x4_t vld4_u32(const uint32_t *__a) { return ret; } -FUNK float16x4x4_t vld4_f16(const float16_t *__a) { +__funline float16x4x4_t vld4_f16(const float16_t *__a) { float16x4x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v4hf(__a); @@ -12571,7 +12621,7 @@ FUNK float16x4x4_t vld4_f16(const float16_t *__a) { return ret; } -FUNK float32x2x4_t vld4_f32(const float32_t *__a) { +__funline float32x2x4_t vld4_f32(const float32_t *__a) { float32x2x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v2sf((const __builtin_aarch64_simd_sf *)__a); @@ -12582,7 +12632,7 @@ FUNK float32x2x4_t vld4_f32(const float32_t *__a) { return ret; } -FUNK poly64x1x4_t vld4_p64(const poly64_t *__a) { +__funline poly64x1x4_t vld4_p64(const poly64_t *__a) { poly64x1x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4di((const __builtin_aarch64_simd_di *)__a); @@ -12593,7 +12643,7 @@ FUNK poly64x1x4_t vld4_p64(const poly64_t *__a) { return ret; } -FUNK int8x16x4_t vld4q_s8(const int8_t *__a) { +__funline int8x16x4_t vld4q_s8(const int8_t *__a) { int8x16x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v16qi((const __builtin_aarch64_simd_qi *)__a); @@ -12604,7 +12654,7 @@ FUNK int8x16x4_t vld4q_s8(const int8_t *__a) { return ret; } -FUNK poly8x16x4_t vld4q_p8(const poly8_t *__a) { +__funline poly8x16x4_t vld4q_p8(const poly8_t *__a) { poly8x16x4_t ret; __builtin_aarch64_simd_xi __o; __o = 
__builtin_aarch64_ld4v16qi((const __builtin_aarch64_simd_qi *)__a); @@ -12615,7 +12665,7 @@ FUNK poly8x16x4_t vld4q_p8(const poly8_t *__a) { return ret; } -FUNK int16x8x4_t vld4q_s16(const int16_t *__a) { +__funline int16x8x4_t vld4q_s16(const int16_t *__a) { int16x8x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v8hi((const __builtin_aarch64_simd_hi *)__a); @@ -12626,7 +12676,7 @@ FUNK int16x8x4_t vld4q_s16(const int16_t *__a) { return ret; } -FUNK poly16x8x4_t vld4q_p16(const poly16_t *__a) { +__funline poly16x8x4_t vld4q_p16(const poly16_t *__a) { poly16x8x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v8hi((const __builtin_aarch64_simd_hi *)__a); @@ -12637,7 +12687,7 @@ FUNK poly16x8x4_t vld4q_p16(const poly16_t *__a) { return ret; } -FUNK int32x4x4_t vld4q_s32(const int32_t *__a) { +__funline int32x4x4_t vld4q_s32(const int32_t *__a) { int32x4x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v4si((const __builtin_aarch64_simd_si *)__a); @@ -12648,7 +12698,7 @@ FUNK int32x4x4_t vld4q_s32(const int32_t *__a) { return ret; } -FUNK int64x2x4_t vld4q_s64(const int64_t *__a) { +__funline int64x2x4_t vld4q_s64(const int64_t *__a) { int64x2x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v2di((const __builtin_aarch64_simd_di *)__a); @@ -12659,7 +12709,7 @@ FUNK int64x2x4_t vld4q_s64(const int64_t *__a) { return ret; } -FUNK uint8x16x4_t vld4q_u8(const uint8_t *__a) { +__funline uint8x16x4_t vld4q_u8(const uint8_t *__a) { uint8x16x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v16qi((const __builtin_aarch64_simd_qi *)__a); @@ -12670,7 +12720,7 @@ FUNK uint8x16x4_t vld4q_u8(const uint8_t *__a) { return ret; } -FUNK uint16x8x4_t vld4q_u16(const uint16_t *__a) { +__funline uint16x8x4_t vld4q_u16(const uint16_t *__a) { uint16x8x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v8hi((const __builtin_aarch64_simd_hi *)__a); @@ -12681,7 +12731,7 @@ FUNK uint16x8x4_t vld4q_u16(const uint16_t *__a) { return ret; } -FUNK uint32x4x4_t vld4q_u32(const uint32_t *__a) { +__funline uint32x4x4_t vld4q_u32(const uint32_t *__a) { uint32x4x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v4si((const __builtin_aarch64_simd_si *)__a); @@ -12692,7 +12742,7 @@ FUNK uint32x4x4_t vld4q_u32(const uint32_t *__a) { return ret; } -FUNK uint64x2x4_t vld4q_u64(const uint64_t *__a) { +__funline uint64x2x4_t vld4q_u64(const uint64_t *__a) { uint64x2x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v2di((const __builtin_aarch64_simd_di *)__a); @@ -12703,7 +12753,7 @@ FUNK uint64x2x4_t vld4q_u64(const uint64_t *__a) { return ret; } -FUNK float16x8x4_t vld4q_f16(const float16_t *__a) { +__funline float16x8x4_t vld4q_f16(const float16_t *__a) { float16x8x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v8hf(__a); @@ -12714,7 +12764,7 @@ FUNK float16x8x4_t vld4q_f16(const float16_t *__a) { return ret; } -FUNK float32x4x4_t vld4q_f32(const float32_t *__a) { +__funline float32x4x4_t vld4q_f32(const float32_t *__a) { float32x4x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v4sf((const __builtin_aarch64_simd_sf *)__a); @@ -12725,7 +12775,7 @@ FUNK float32x4x4_t vld4q_f32(const float32_t *__a) { return ret; } -FUNK float64x2x4_t vld4q_f64(const float64_t *__a) { +__funline float64x2x4_t vld4q_f64(const float64_t *__a) { float64x2x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v2df((const __builtin_aarch64_simd_df *)__a); @@ 
-12736,7 +12786,7 @@ FUNK float64x2x4_t vld4q_f64(const float64_t *__a) { return ret; } -FUNK poly64x2x4_t vld4q_p64(const poly64_t *__a) { +__funline poly64x2x4_t vld4q_p64(const poly64_t *__a) { poly64x2x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4v2di((const __builtin_aarch64_simd_di *)__a); @@ -12747,7 +12797,7 @@ FUNK poly64x2x4_t vld4q_p64(const poly64_t *__a) { return ret; } -FUNK int8x8x2_t vld2_dup_s8(const int8_t *__a) { +__funline int8x8x2_t vld2_dup_s8(const int8_t *__a) { int8x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv8qi((const __builtin_aarch64_simd_qi *)__a); @@ -12756,7 +12806,7 @@ FUNK int8x8x2_t vld2_dup_s8(const int8_t *__a) { return ret; } -FUNK int16x4x2_t vld2_dup_s16(const int16_t *__a) { +__funline int16x4x2_t vld2_dup_s16(const int16_t *__a) { int16x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv4hi((const __builtin_aarch64_simd_hi *)__a); @@ -12765,7 +12815,7 @@ FUNK int16x4x2_t vld2_dup_s16(const int16_t *__a) { return ret; } -FUNK int32x2x2_t vld2_dup_s32(const int32_t *__a) { +__funline int32x2x2_t vld2_dup_s32(const int32_t *__a) { int32x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv2si((const __builtin_aarch64_simd_si *)__a); @@ -12774,7 +12824,7 @@ FUNK int32x2x2_t vld2_dup_s32(const int32_t *__a) { return ret; } -FUNK float16x4x2_t vld2_dup_f16(const float16_t *__a) { +__funline float16x4x2_t vld2_dup_f16(const float16_t *__a) { float16x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv4hf((const __builtin_aarch64_simd_hf *)__a); @@ -12783,7 +12833,7 @@ FUNK float16x4x2_t vld2_dup_f16(const float16_t *__a) { return ret; } -FUNK float32x2x2_t vld2_dup_f32(const float32_t *__a) { +__funline float32x2x2_t vld2_dup_f32(const float32_t *__a) { float32x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv2sf((const __builtin_aarch64_simd_sf *)__a); @@ -12792,7 +12842,7 @@ FUNK float32x2x2_t vld2_dup_f32(const float32_t *__a) { return ret; } -FUNK float64x1x2_t vld2_dup_f64(const float64_t *__a) { +__funline float64x1x2_t vld2_dup_f64(const float64_t *__a) { float64x1x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rdf((const __builtin_aarch64_simd_df *)__a); @@ -12801,7 +12851,7 @@ FUNK float64x1x2_t vld2_dup_f64(const float64_t *__a) { return ret; } -FUNK uint8x8x2_t vld2_dup_u8(const uint8_t *__a) { +__funline uint8x8x2_t vld2_dup_u8(const uint8_t *__a) { uint8x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv8qi((const __builtin_aarch64_simd_qi *)__a); @@ -12810,7 +12860,7 @@ FUNK uint8x8x2_t vld2_dup_u8(const uint8_t *__a) { return ret; } -FUNK uint16x4x2_t vld2_dup_u16(const uint16_t *__a) { +__funline uint16x4x2_t vld2_dup_u16(const uint16_t *__a) { uint16x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv4hi((const __builtin_aarch64_simd_hi *)__a); @@ -12819,7 +12869,7 @@ FUNK uint16x4x2_t vld2_dup_u16(const uint16_t *__a) { return ret; } -FUNK uint32x2x2_t vld2_dup_u32(const uint32_t *__a) { +__funline uint32x2x2_t vld2_dup_u32(const uint32_t *__a) { uint32x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv2si((const __builtin_aarch64_simd_si *)__a); @@ -12828,7 +12878,7 @@ FUNK uint32x2x2_t vld2_dup_u32(const uint32_t *__a) { return ret; } -FUNK poly8x8x2_t vld2_dup_p8(const poly8_t *__a) { +__funline poly8x8x2_t vld2_dup_p8(const poly8_t *__a) { poly8x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv8qi((const 
__builtin_aarch64_simd_qi *)__a); @@ -12837,7 +12887,7 @@ FUNK poly8x8x2_t vld2_dup_p8(const poly8_t *__a) { return ret; } -FUNK poly16x4x2_t vld2_dup_p16(const poly16_t *__a) { +__funline poly16x4x2_t vld2_dup_p16(const poly16_t *__a) { poly16x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv4hi((const __builtin_aarch64_simd_hi *)__a); @@ -12846,7 +12896,7 @@ FUNK poly16x4x2_t vld2_dup_p16(const poly16_t *__a) { return ret; } -FUNK poly64x1x2_t vld2_dup_p64(const poly64_t *__a) { +__funline poly64x1x2_t vld2_dup_p64(const poly64_t *__a) { poly64x1x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv2di((const __builtin_aarch64_simd_di *)__a); @@ -12855,7 +12905,7 @@ FUNK poly64x1x2_t vld2_dup_p64(const poly64_t *__a) { return ret; } -FUNK int64x1x2_t vld2_dup_s64(const int64_t *__a) { +__funline int64x1x2_t vld2_dup_s64(const int64_t *__a) { int64x1x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rdi((const __builtin_aarch64_simd_di *)__a); @@ -12864,7 +12914,7 @@ FUNK int64x1x2_t vld2_dup_s64(const int64_t *__a) { return ret; } -FUNK uint64x1x2_t vld2_dup_u64(const uint64_t *__a) { +__funline uint64x1x2_t vld2_dup_u64(const uint64_t *__a) { uint64x1x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rdi((const __builtin_aarch64_simd_di *)__a); @@ -12873,7 +12923,7 @@ FUNK uint64x1x2_t vld2_dup_u64(const uint64_t *__a) { return ret; } -FUNK int8x16x2_t vld2q_dup_s8(const int8_t *__a) { +__funline int8x16x2_t vld2q_dup_s8(const int8_t *__a) { int8x16x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv16qi((const __builtin_aarch64_simd_qi *)__a); @@ -12882,7 +12932,7 @@ FUNK int8x16x2_t vld2q_dup_s8(const int8_t *__a) { return ret; } -FUNK poly8x16x2_t vld2q_dup_p8(const poly8_t *__a) { +__funline poly8x16x2_t vld2q_dup_p8(const poly8_t *__a) { poly8x16x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv16qi((const __builtin_aarch64_simd_qi *)__a); @@ -12891,7 +12941,7 @@ FUNK poly8x16x2_t vld2q_dup_p8(const poly8_t *__a) { return ret; } -FUNK int16x8x2_t vld2q_dup_s16(const int16_t *__a) { +__funline int16x8x2_t vld2q_dup_s16(const int16_t *__a) { int16x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv8hi((const __builtin_aarch64_simd_hi *)__a); @@ -12900,7 +12950,7 @@ FUNK int16x8x2_t vld2q_dup_s16(const int16_t *__a) { return ret; } -FUNK poly16x8x2_t vld2q_dup_p16(const poly16_t *__a) { +__funline poly16x8x2_t vld2q_dup_p16(const poly16_t *__a) { poly16x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv8hi((const __builtin_aarch64_simd_hi *)__a); @@ -12909,7 +12959,7 @@ FUNK poly16x8x2_t vld2q_dup_p16(const poly16_t *__a) { return ret; } -FUNK int32x4x2_t vld2q_dup_s32(const int32_t *__a) { +__funline int32x4x2_t vld2q_dup_s32(const int32_t *__a) { int32x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv4si((const __builtin_aarch64_simd_si *)__a); @@ -12918,7 +12968,7 @@ FUNK int32x4x2_t vld2q_dup_s32(const int32_t *__a) { return ret; } -FUNK int64x2x2_t vld2q_dup_s64(const int64_t *__a) { +__funline int64x2x2_t vld2q_dup_s64(const int64_t *__a) { int64x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv2di((const __builtin_aarch64_simd_di *)__a); @@ -12927,7 +12977,7 @@ FUNK int64x2x2_t vld2q_dup_s64(const int64_t *__a) { return ret; } -FUNK uint8x16x2_t vld2q_dup_u8(const uint8_t *__a) { +__funline uint8x16x2_t vld2q_dup_u8(const uint8_t *__a) { uint8x16x2_t ret; __builtin_aarch64_simd_oi __o; 
__o = __builtin_aarch64_ld2rv16qi((const __builtin_aarch64_simd_qi *)__a); @@ -12936,7 +12986,7 @@ FUNK uint8x16x2_t vld2q_dup_u8(const uint8_t *__a) { return ret; } -FUNK uint16x8x2_t vld2q_dup_u16(const uint16_t *__a) { +__funline uint16x8x2_t vld2q_dup_u16(const uint16_t *__a) { uint16x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv8hi((const __builtin_aarch64_simd_hi *)__a); @@ -12945,7 +12995,7 @@ FUNK uint16x8x2_t vld2q_dup_u16(const uint16_t *__a) { return ret; } -FUNK uint32x4x2_t vld2q_dup_u32(const uint32_t *__a) { +__funline uint32x4x2_t vld2q_dup_u32(const uint32_t *__a) { uint32x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv4si((const __builtin_aarch64_simd_si *)__a); @@ -12954,7 +13004,7 @@ FUNK uint32x4x2_t vld2q_dup_u32(const uint32_t *__a) { return ret; } -FUNK uint64x2x2_t vld2q_dup_u64(const uint64_t *__a) { +__funline uint64x2x2_t vld2q_dup_u64(const uint64_t *__a) { uint64x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv2di((const __builtin_aarch64_simd_di *)__a); @@ -12963,7 +13013,7 @@ FUNK uint64x2x2_t vld2q_dup_u64(const uint64_t *__a) { return ret; } -FUNK float16x8x2_t vld2q_dup_f16(const float16_t *__a) { +__funline float16x8x2_t vld2q_dup_f16(const float16_t *__a) { float16x8x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv8hf((const __builtin_aarch64_simd_hf *)__a); @@ -12972,7 +13022,7 @@ FUNK float16x8x2_t vld2q_dup_f16(const float16_t *__a) { return ret; } -FUNK float32x4x2_t vld2q_dup_f32(const float32_t *__a) { +__funline float32x4x2_t vld2q_dup_f32(const float32_t *__a) { float32x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv4sf((const __builtin_aarch64_simd_sf *)__a); @@ -12981,7 +13031,7 @@ FUNK float32x4x2_t vld2q_dup_f32(const float32_t *__a) { return ret; } -FUNK float64x2x2_t vld2q_dup_f64(const float64_t *__a) { +__funline float64x2x2_t vld2q_dup_f64(const float64_t *__a) { float64x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv2df((const __builtin_aarch64_simd_df *)__a); @@ -12990,7 +13040,7 @@ FUNK float64x2x2_t vld2q_dup_f64(const float64_t *__a) { return ret; } -FUNK poly64x2x2_t vld2q_dup_p64(const poly64_t *__a) { +__funline poly64x2x2_t vld2q_dup_p64(const poly64_t *__a) { poly64x2x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2rv2di((const __builtin_aarch64_simd_di *)__a); @@ -12999,7 +13049,7 @@ FUNK poly64x2x2_t vld2q_dup_p64(const poly64_t *__a) { return ret; } -FUNK int64x1x3_t vld3_dup_s64(const int64_t *__a) { +__funline int64x1x3_t vld3_dup_s64(const int64_t *__a) { int64x1x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rdi((const __builtin_aarch64_simd_di *)__a); @@ -13009,7 +13059,7 @@ FUNK int64x1x3_t vld3_dup_s64(const int64_t *__a) { return ret; } -FUNK uint64x1x3_t vld3_dup_u64(const uint64_t *__a) { +__funline uint64x1x3_t vld3_dup_u64(const uint64_t *__a) { uint64x1x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rdi((const __builtin_aarch64_simd_di *)__a); @@ -13019,7 +13069,7 @@ FUNK uint64x1x3_t vld3_dup_u64(const uint64_t *__a) { return ret; } -FUNK float64x1x3_t vld3_dup_f64(const float64_t *__a) { +__funline float64x1x3_t vld3_dup_f64(const float64_t *__a) { float64x1x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rdf((const __builtin_aarch64_simd_df *)__a); @@ -13029,7 +13079,7 @@ FUNK float64x1x3_t vld3_dup_f64(const float64_t *__a) { return ret; } -FUNK int8x8x3_t vld3_dup_s8(const int8_t *__a) { 
+__funline int8x8x3_t vld3_dup_s8(const int8_t *__a) { int8x8x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv8qi((const __builtin_aarch64_simd_qi *)__a); @@ -13039,7 +13089,7 @@ FUNK int8x8x3_t vld3_dup_s8(const int8_t *__a) { return ret; } -FUNK poly8x8x3_t vld3_dup_p8(const poly8_t *__a) { +__funline poly8x8x3_t vld3_dup_p8(const poly8_t *__a) { poly8x8x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv8qi((const __builtin_aarch64_simd_qi *)__a); @@ -13049,7 +13099,7 @@ FUNK poly8x8x3_t vld3_dup_p8(const poly8_t *__a) { return ret; } -FUNK int16x4x3_t vld3_dup_s16(const int16_t *__a) { +__funline int16x4x3_t vld3_dup_s16(const int16_t *__a) { int16x4x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv4hi((const __builtin_aarch64_simd_hi *)__a); @@ -13059,7 +13109,7 @@ FUNK int16x4x3_t vld3_dup_s16(const int16_t *__a) { return ret; } -FUNK poly16x4x3_t vld3_dup_p16(const poly16_t *__a) { +__funline poly16x4x3_t vld3_dup_p16(const poly16_t *__a) { poly16x4x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv4hi((const __builtin_aarch64_simd_hi *)__a); @@ -13069,7 +13119,7 @@ FUNK poly16x4x3_t vld3_dup_p16(const poly16_t *__a) { return ret; } -FUNK int32x2x3_t vld3_dup_s32(const int32_t *__a) { +__funline int32x2x3_t vld3_dup_s32(const int32_t *__a) { int32x2x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv2si((const __builtin_aarch64_simd_si *)__a); @@ -13079,7 +13129,7 @@ FUNK int32x2x3_t vld3_dup_s32(const int32_t *__a) { return ret; } -FUNK uint8x8x3_t vld3_dup_u8(const uint8_t *__a) { +__funline uint8x8x3_t vld3_dup_u8(const uint8_t *__a) { uint8x8x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv8qi((const __builtin_aarch64_simd_qi *)__a); @@ -13089,7 +13139,7 @@ FUNK uint8x8x3_t vld3_dup_u8(const uint8_t *__a) { return ret; } -FUNK uint16x4x3_t vld3_dup_u16(const uint16_t *__a) { +__funline uint16x4x3_t vld3_dup_u16(const uint16_t *__a) { uint16x4x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv4hi((const __builtin_aarch64_simd_hi *)__a); @@ -13099,7 +13149,7 @@ FUNK uint16x4x3_t vld3_dup_u16(const uint16_t *__a) { return ret; } -FUNK uint32x2x3_t vld3_dup_u32(const uint32_t *__a) { +__funline uint32x2x3_t vld3_dup_u32(const uint32_t *__a) { uint32x2x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv2si((const __builtin_aarch64_simd_si *)__a); @@ -13109,7 +13159,7 @@ FUNK uint32x2x3_t vld3_dup_u32(const uint32_t *__a) { return ret; } -FUNK float16x4x3_t vld3_dup_f16(const float16_t *__a) { +__funline float16x4x3_t vld3_dup_f16(const float16_t *__a) { float16x4x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv4hf((const __builtin_aarch64_simd_hf *)__a); @@ -13119,7 +13169,7 @@ FUNK float16x4x3_t vld3_dup_f16(const float16_t *__a) { return ret; } -FUNK float32x2x3_t vld3_dup_f32(const float32_t *__a) { +__funline float32x2x3_t vld3_dup_f32(const float32_t *__a) { float32x2x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv2sf((const __builtin_aarch64_simd_sf *)__a); @@ -13129,7 +13179,7 @@ FUNK float32x2x3_t vld3_dup_f32(const float32_t *__a) { return ret; } -FUNK poly64x1x3_t vld3_dup_p64(const poly64_t *__a) { +__funline poly64x1x3_t vld3_dup_p64(const poly64_t *__a) { poly64x1x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv2di((const __builtin_aarch64_simd_di *)__a); @@ -13139,7 +13189,7 @@ FUNK poly64x1x3_t vld3_dup_p64(const poly64_t *__a) { return ret; } -FUNK 
int8x16x3_t vld3q_dup_s8(const int8_t *__a) { +__funline int8x16x3_t vld3q_dup_s8(const int8_t *__a) { int8x16x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv16qi((const __builtin_aarch64_simd_qi *)__a); @@ -13149,7 +13199,7 @@ FUNK int8x16x3_t vld3q_dup_s8(const int8_t *__a) { return ret; } -FUNK poly8x16x3_t vld3q_dup_p8(const poly8_t *__a) { +__funline poly8x16x3_t vld3q_dup_p8(const poly8_t *__a) { poly8x16x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv16qi((const __builtin_aarch64_simd_qi *)__a); @@ -13159,7 +13209,7 @@ FUNK poly8x16x3_t vld3q_dup_p8(const poly8_t *__a) { return ret; } -FUNK int16x8x3_t vld3q_dup_s16(const int16_t *__a) { +__funline int16x8x3_t vld3q_dup_s16(const int16_t *__a) { int16x8x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv8hi((const __builtin_aarch64_simd_hi *)__a); @@ -13169,7 +13219,7 @@ FUNK int16x8x3_t vld3q_dup_s16(const int16_t *__a) { return ret; } -FUNK poly16x8x3_t vld3q_dup_p16(const poly16_t *__a) { +__funline poly16x8x3_t vld3q_dup_p16(const poly16_t *__a) { poly16x8x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv8hi((const __builtin_aarch64_simd_hi *)__a); @@ -13179,7 +13229,7 @@ FUNK poly16x8x3_t vld3q_dup_p16(const poly16_t *__a) { return ret; } -FUNK int32x4x3_t vld3q_dup_s32(const int32_t *__a) { +__funline int32x4x3_t vld3q_dup_s32(const int32_t *__a) { int32x4x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv4si((const __builtin_aarch64_simd_si *)__a); @@ -13189,7 +13239,7 @@ FUNK int32x4x3_t vld3q_dup_s32(const int32_t *__a) { return ret; } -FUNK int64x2x3_t vld3q_dup_s64(const int64_t *__a) { +__funline int64x2x3_t vld3q_dup_s64(const int64_t *__a) { int64x2x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv2di((const __builtin_aarch64_simd_di *)__a); @@ -13199,7 +13249,7 @@ FUNK int64x2x3_t vld3q_dup_s64(const int64_t *__a) { return ret; } -FUNK uint8x16x3_t vld3q_dup_u8(const uint8_t *__a) { +__funline uint8x16x3_t vld3q_dup_u8(const uint8_t *__a) { uint8x16x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv16qi((const __builtin_aarch64_simd_qi *)__a); @@ -13209,7 +13259,7 @@ FUNK uint8x16x3_t vld3q_dup_u8(const uint8_t *__a) { return ret; } -FUNK uint16x8x3_t vld3q_dup_u16(const uint16_t *__a) { +__funline uint16x8x3_t vld3q_dup_u16(const uint16_t *__a) { uint16x8x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv8hi((const __builtin_aarch64_simd_hi *)__a); @@ -13219,7 +13269,7 @@ FUNK uint16x8x3_t vld3q_dup_u16(const uint16_t *__a) { return ret; } -FUNK uint32x4x3_t vld3q_dup_u32(const uint32_t *__a) { +__funline uint32x4x3_t vld3q_dup_u32(const uint32_t *__a) { uint32x4x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv4si((const __builtin_aarch64_simd_si *)__a); @@ -13229,7 +13279,7 @@ FUNK uint32x4x3_t vld3q_dup_u32(const uint32_t *__a) { return ret; } -FUNK uint64x2x3_t vld3q_dup_u64(const uint64_t *__a) { +__funline uint64x2x3_t vld3q_dup_u64(const uint64_t *__a) { uint64x2x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv2di((const __builtin_aarch64_simd_di *)__a); @@ -13239,7 +13289,7 @@ FUNK uint64x2x3_t vld3q_dup_u64(const uint64_t *__a) { return ret; } -FUNK float16x8x3_t vld3q_dup_f16(const float16_t *__a) { +__funline float16x8x3_t vld3q_dup_f16(const float16_t *__a) { float16x8x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv8hf((const __builtin_aarch64_simd_hf *)__a); @@ -13249,7 +13299,7 @@ FUNK 
float16x8x3_t vld3q_dup_f16(const float16_t *__a) { return ret; } -FUNK float32x4x3_t vld3q_dup_f32(const float32_t *__a) { +__funline float32x4x3_t vld3q_dup_f32(const float32_t *__a) { float32x4x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv4sf((const __builtin_aarch64_simd_sf *)__a); @@ -13259,7 +13309,7 @@ FUNK float32x4x3_t vld3q_dup_f32(const float32_t *__a) { return ret; } -FUNK float64x2x3_t vld3q_dup_f64(const float64_t *__a) { +__funline float64x2x3_t vld3q_dup_f64(const float64_t *__a) { float64x2x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv2df((const __builtin_aarch64_simd_df *)__a); @@ -13269,7 +13319,7 @@ FUNK float64x2x3_t vld3q_dup_f64(const float64_t *__a) { return ret; } -FUNK poly64x2x3_t vld3q_dup_p64(const poly64_t *__a) { +__funline poly64x2x3_t vld3q_dup_p64(const poly64_t *__a) { poly64x2x3_t ret; __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_ld3rv2di((const __builtin_aarch64_simd_di *)__a); @@ -13279,7 +13329,7 @@ FUNK poly64x2x3_t vld3q_dup_p64(const poly64_t *__a) { return ret; } -FUNK int64x1x4_t vld4_dup_s64(const int64_t *__a) { +__funline int64x1x4_t vld4_dup_s64(const int64_t *__a) { int64x1x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rdi((const __builtin_aarch64_simd_di *)__a); @@ -13290,7 +13340,7 @@ FUNK int64x1x4_t vld4_dup_s64(const int64_t *__a) { return ret; } -FUNK uint64x1x4_t vld4_dup_u64(const uint64_t *__a) { +__funline uint64x1x4_t vld4_dup_u64(const uint64_t *__a) { uint64x1x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rdi((const __builtin_aarch64_simd_di *)__a); @@ -13301,7 +13351,7 @@ FUNK uint64x1x4_t vld4_dup_u64(const uint64_t *__a) { return ret; } -FUNK float64x1x4_t vld4_dup_f64(const float64_t *__a) { +__funline float64x1x4_t vld4_dup_f64(const float64_t *__a) { float64x1x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rdf((const __builtin_aarch64_simd_df *)__a); @@ -13312,7 +13362,7 @@ FUNK float64x1x4_t vld4_dup_f64(const float64_t *__a) { return ret; } -FUNK int8x8x4_t vld4_dup_s8(const int8_t *__a) { +__funline int8x8x4_t vld4_dup_s8(const int8_t *__a) { int8x8x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rv8qi((const __builtin_aarch64_simd_qi *)__a); @@ -13323,7 +13373,7 @@ FUNK int8x8x4_t vld4_dup_s8(const int8_t *__a) { return ret; } -FUNK poly8x8x4_t vld4_dup_p8(const poly8_t *__a) { +__funline poly8x8x4_t vld4_dup_p8(const poly8_t *__a) { poly8x8x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rv8qi((const __builtin_aarch64_simd_qi *)__a); @@ -13334,7 +13384,7 @@ FUNK poly8x8x4_t vld4_dup_p8(const poly8_t *__a) { return ret; } -FUNK int16x4x4_t vld4_dup_s16(const int16_t *__a) { +__funline int16x4x4_t vld4_dup_s16(const int16_t *__a) { int16x4x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rv4hi((const __builtin_aarch64_simd_hi *)__a); @@ -13345,7 +13395,7 @@ FUNK int16x4x4_t vld4_dup_s16(const int16_t *__a) { return ret; } -FUNK poly16x4x4_t vld4_dup_p16(const poly16_t *__a) { +__funline poly16x4x4_t vld4_dup_p16(const poly16_t *__a) { poly16x4x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rv4hi((const __builtin_aarch64_simd_hi *)__a); @@ -13356,7 +13406,7 @@ FUNK poly16x4x4_t vld4_dup_p16(const poly16_t *__a) { return ret; } -FUNK int32x2x4_t vld4_dup_s32(const int32_t *__a) { +__funline int32x2x4_t vld4_dup_s32(const int32_t *__a) { int32x2x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rv2si((const 
__builtin_aarch64_simd_si *)__a); @@ -13367,7 +13417,7 @@ FUNK int32x2x4_t vld4_dup_s32(const int32_t *__a) { return ret; } -FUNK uint8x8x4_t vld4_dup_u8(const uint8_t *__a) { +__funline uint8x8x4_t vld4_dup_u8(const uint8_t *__a) { uint8x8x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rv8qi((const __builtin_aarch64_simd_qi *)__a); @@ -13378,7 +13428,7 @@ FUNK uint8x8x4_t vld4_dup_u8(const uint8_t *__a) { return ret; } -FUNK uint16x4x4_t vld4_dup_u16(const uint16_t *__a) { +__funline uint16x4x4_t vld4_dup_u16(const uint16_t *__a) { uint16x4x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rv4hi((const __builtin_aarch64_simd_hi *)__a); @@ -13389,7 +13439,7 @@ FUNK uint16x4x4_t vld4_dup_u16(const uint16_t *__a) { return ret; } -FUNK uint32x2x4_t vld4_dup_u32(const uint32_t *__a) { +__funline uint32x2x4_t vld4_dup_u32(const uint32_t *__a) { uint32x2x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rv2si((const __builtin_aarch64_simd_si *)__a); @@ -13400,7 +13450,7 @@ FUNK uint32x2x4_t vld4_dup_u32(const uint32_t *__a) { return ret; } -FUNK float16x4x4_t vld4_dup_f16(const float16_t *__a) { +__funline float16x4x4_t vld4_dup_f16(const float16_t *__a) { float16x4x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rv4hf((const __builtin_aarch64_simd_hf *)__a); @@ -13411,7 +13461,7 @@ FUNK float16x4x4_t vld4_dup_f16(const float16_t *__a) { return ret; } -FUNK float32x2x4_t vld4_dup_f32(const float32_t *__a) { +__funline float32x2x4_t vld4_dup_f32(const float32_t *__a) { float32x2x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rv2sf((const __builtin_aarch64_simd_sf *)__a); @@ -13422,7 +13472,7 @@ FUNK float32x2x4_t vld4_dup_f32(const float32_t *__a) { return ret; } -FUNK poly64x1x4_t vld4_dup_p64(const poly64_t *__a) { +__funline poly64x1x4_t vld4_dup_p64(const poly64_t *__a) { poly64x1x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rv2di((const __builtin_aarch64_simd_di *)__a); @@ -13433,7 +13483,7 @@ FUNK poly64x1x4_t vld4_dup_p64(const poly64_t *__a) { return ret; } -FUNK int8x16x4_t vld4q_dup_s8(const int8_t *__a) { +__funline int8x16x4_t vld4q_dup_s8(const int8_t *__a) { int8x16x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rv16qi((const __builtin_aarch64_simd_qi *)__a); @@ -13444,7 +13494,7 @@ FUNK int8x16x4_t vld4q_dup_s8(const int8_t *__a) { return ret; } -FUNK poly8x16x4_t vld4q_dup_p8(const poly8_t *__a) { +__funline poly8x16x4_t vld4q_dup_p8(const poly8_t *__a) { poly8x16x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rv16qi((const __builtin_aarch64_simd_qi *)__a); @@ -13455,7 +13505,7 @@ FUNK poly8x16x4_t vld4q_dup_p8(const poly8_t *__a) { return ret; } -FUNK int16x8x4_t vld4q_dup_s16(const int16_t *__a) { +__funline int16x8x4_t vld4q_dup_s16(const int16_t *__a) { int16x8x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rv8hi((const __builtin_aarch64_simd_hi *)__a); @@ -13466,7 +13516,7 @@ FUNK int16x8x4_t vld4q_dup_s16(const int16_t *__a) { return ret; } -FUNK poly16x8x4_t vld4q_dup_p16(const poly16_t *__a) { +__funline poly16x8x4_t vld4q_dup_p16(const poly16_t *__a) { poly16x8x4_t ret; __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_ld4rv8hi((const __builtin_aarch64_simd_hi *)__a); @@ -13477,7 +13527,7 @@ FUNK poly16x8x4_t vld4q_dup_p16(const poly16_t *__a) { return ret; } -FUNK int32x4x4_t vld4q_dup_s32(const int32_t *__a) { +__funline int32x4x4_t vld4q_dup_s32(const int32_t *__a) { int32x4x4_t ret; 
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_ld4rv4si((const __builtin_aarch64_simd_si *)__a);
@@ -13488,7 +13538,7 @@ FUNK int32x4x4_t vld4q_dup_s32(const int32_t *__a) {
   return ret;
 }

-FUNK int64x2x4_t vld4q_dup_s64(const int64_t *__a) {
+__funline int64x2x4_t vld4q_dup_s64(const int64_t *__a) {
   int64x2x4_t ret;
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_ld4rv2di((const __builtin_aarch64_simd_di *)__a);
@@ -13499,7 +13549,7 @@ FUNK int64x2x4_t vld4q_dup_s64(const int64_t *__a) {
   return ret;
 }

-FUNK uint8x16x4_t vld4q_dup_u8(const uint8_t *__a) {
+__funline uint8x16x4_t vld4q_dup_u8(const uint8_t *__a) {
   uint8x16x4_t ret;
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_ld4rv16qi((const __builtin_aarch64_simd_qi *)__a);
@@ -13510,7 +13560,7 @@ FUNK uint8x16x4_t vld4q_dup_u8(const uint8_t *__a) {
   return ret;
 }

-FUNK uint16x8x4_t vld4q_dup_u16(const uint16_t *__a) {
+__funline uint16x8x4_t vld4q_dup_u16(const uint16_t *__a) {
   uint16x8x4_t ret;
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_ld4rv8hi((const __builtin_aarch64_simd_hi *)__a);
@@ -13521,7 +13571,7 @@ FUNK uint16x8x4_t vld4q_dup_u16(const uint16_t *__a) {
   return ret;
 }

-FUNK uint32x4x4_t vld4q_dup_u32(const uint32_t *__a) {
+__funline uint32x4x4_t vld4q_dup_u32(const uint32_t *__a) {
   uint32x4x4_t ret;
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_ld4rv4si((const __builtin_aarch64_simd_si *)__a);
@@ -13532,7 +13582,7 @@ FUNK uint32x4x4_t vld4q_dup_u32(const uint32_t *__a) {
   return ret;
 }

-FUNK uint64x2x4_t vld4q_dup_u64(const uint64_t *__a) {
+__funline uint64x2x4_t vld4q_dup_u64(const uint64_t *__a) {
   uint64x2x4_t ret;
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_ld4rv2di((const __builtin_aarch64_simd_di *)__a);
@@ -13543,7 +13593,7 @@ FUNK uint64x2x4_t vld4q_dup_u64(const uint64_t *__a) {
   return ret;
 }

-FUNK float16x8x4_t vld4q_dup_f16(const float16_t *__a) {
+__funline float16x8x4_t vld4q_dup_f16(const float16_t *__a) {
   float16x8x4_t ret;
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_ld4rv8hf((const __builtin_aarch64_simd_hf *)__a);
@@ -13554,7 +13604,7 @@ FUNK float16x8x4_t vld4q_dup_f16(const float16_t *__a) {
   return ret;
 }

-FUNK float32x4x4_t vld4q_dup_f32(const float32_t *__a) {
+__funline float32x4x4_t vld4q_dup_f32(const float32_t *__a) {
   float32x4x4_t ret;
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_ld4rv4sf((const __builtin_aarch64_simd_sf *)__a);
@@ -13565,7 +13615,7 @@ FUNK float32x4x4_t vld4q_dup_f32(const float32_t *__a) {
   return ret;
 }

-FUNK float64x2x4_t vld4q_dup_f64(const float64_t *__a) {
+__funline float64x2x4_t vld4q_dup_f64(const float64_t *__a) {
   float64x2x4_t ret;
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_ld4rv2df((const __builtin_aarch64_simd_df *)__a);
@@ -13576,7 +13626,7 @@ FUNK float64x2x4_t vld4q_dup_f64(const float64_t *__a) {
   return ret;
 }

-FUNK poly64x2x4_t vld4q_dup_p64(const poly64_t *__a) {
+__funline poly64x2x4_t vld4q_dup_p64(const poly64_t *__a) {
   poly64x2x4_t ret;
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_ld4rv2di((const __builtin_aarch64_simd_di *)__a);
@@ -13589,8 +13639,8 @@ FUNK poly64x2x4_t vld4q_dup_p64(const poly64_t *__a) {

 #define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, qmode, \
                         ptrmode, funcsuffix, signedtype) \
-  FUNK intype vld2_lane_##funcsuffix(const ptrtype *__ptr, intype __b, \
-                                     const int __c) { \
+  __funline intype vld2_lane_##funcsuffix(const ptrtype *__ptr, intype __b, \
+                                          const int __c) { \
     __builtin_aarch64_simd_oi __o; \
     largetype __temp; \
     __temp.val[0] = \
@@ -13639,18 +13689,18 @@ __LD2_LANE_FUNC(uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, di, v2di, di,

 #undef __LD2_LANE_FUNC

-#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
-  FUNK intype vld2q_lane_##funcsuffix(const ptrtype *__ptr, intype __b, \
-                                      const int __c) { \
-    __builtin_aarch64_simd_oi __o; \
-    intype ret; \
-    __o = __builtin_aarch64_set_qregoiv4si(__o, (int32x4_t)__b.val[0], 0); \
-    __o = __builtin_aarch64_set_qregoiv4si(__o, (int32x4_t)__b.val[1], 1); \
-    __o = __builtin_aarch64_ld2_lane##mode( \
-        (__builtin_aarch64_simd_##ptrmode *)__ptr, __o, __c); \
-    ret.val[0] = (vtype)__builtin_aarch64_get_qregoiv4si(__o, 0); \
-    ret.val[1] = (vtype)__builtin_aarch64_get_qregoiv4si(__o, 1); \
-    return ret; \
+#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+  __funline intype vld2q_lane_##funcsuffix(const ptrtype *__ptr, intype __b, \
+                                           const int __c) { \
+    __builtin_aarch64_simd_oi __o; \
+    intype ret; \
+    __o = __builtin_aarch64_set_qregoiv4si(__o, (int32x4_t)__b.val[0], 0); \
+    __o = __builtin_aarch64_set_qregoiv4si(__o, (int32x4_t)__b.val[1], 1); \
+    __o = __builtin_aarch64_ld2_lane##mode( \
+        (__builtin_aarch64_simd_##ptrmode *)__ptr, __o, __c); \
+    ret.val[0] = (vtype)__builtin_aarch64_get_qregoiv4si(__o, 0); \
+    ret.val[1] = (vtype)__builtin_aarch64_get_qregoiv4si(__o, 1); \
+    return ret; \
   }

 __LD2_LANE_FUNC(float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16)
@@ -13672,8 +13722,8 @@ __LD2_LANE_FUNC(uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)

 #define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, qmode, \
                         ptrmode, funcsuffix, signedtype) \
-  FUNK intype vld3_lane_##funcsuffix(const ptrtype *__ptr, intype __b, \
-                                     const int __c) { \
+  __funline intype vld3_lane_##funcsuffix(const ptrtype *__ptr, intype __b, \
+                                          const int __c) { \
     __builtin_aarch64_simd_ci __o; \
     largetype __temp; \
     __temp.val[0] = \
@@ -13727,20 +13777,20 @@ __LD3_LANE_FUNC(uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di,

 #undef __LD3_LANE_FUNC

-#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
-  FUNK intype vld3q_lane_##funcsuffix(const ptrtype *__ptr, intype __b, \
-                                      const int __c) { \
-    __builtin_aarch64_simd_ci __o; \
-    intype ret; \
-    __o = __builtin_aarch64_set_qregciv4si(__o, (int32x4_t)__b.val[0], 0); \
-    __o = __builtin_aarch64_set_qregciv4si(__o, (int32x4_t)__b.val[1], 1); \
-    __o = __builtin_aarch64_set_qregciv4si(__o, (int32x4_t)__b.val[2], 2); \
-    __o = __builtin_aarch64_ld3_lane##mode( \
-        (__builtin_aarch64_simd_##ptrmode *)__ptr, __o, __c); \
-    ret.val[0] = (vtype)__builtin_aarch64_get_qregciv4si(__o, 0); \
-    ret.val[1] = (vtype)__builtin_aarch64_get_qregciv4si(__o, 1); \
-    ret.val[2] = (vtype)__builtin_aarch64_get_qregciv4si(__o, 2); \
-    return ret; \
+#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+  __funline intype vld3q_lane_##funcsuffix(const ptrtype *__ptr, intype __b, \
+                                           const int __c) { \
+    __builtin_aarch64_simd_ci __o; \
+    intype ret; \
+    __o = __builtin_aarch64_set_qregciv4si(__o, (int32x4_t)__b.val[0], 0); \
+    __o = __builtin_aarch64_set_qregciv4si(__o, (int32x4_t)__b.val[1], 1); \
+    __o = __builtin_aarch64_set_qregciv4si(__o, (int32x4_t)__b.val[2], 2); \
+    __o = __builtin_aarch64_ld3_lane##mode( \
+        (__builtin_aarch64_simd_##ptrmode *)__ptr, __o, __c); \
+    ret.val[0] = (vtype)__builtin_aarch64_get_qregciv4si(__o, 0); \
+    ret.val[1] = (vtype)__builtin_aarch64_get_qregciv4si(__o, 1); \
+    ret.val[2] = (vtype)__builtin_aarch64_get_qregciv4si(__o, 2); \
+    return ret; \
   }

 __LD3_LANE_FUNC(float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16)
@@ -13762,8 +13812,8 @@ __LD3_LANE_FUNC(uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)

 #define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, qmode, \
                         ptrmode, funcsuffix, signedtype) \
-  FUNK intype vld4_lane_##funcsuffix(const ptrtype *__ptr, intype __b, \
-                                     const int __c) { \
+  __funline intype vld4_lane_##funcsuffix(const ptrtype *__ptr, intype __b, \
+                                          const int __c) { \
     __builtin_aarch64_simd_xi __o; \
     largetype __temp; \
     __temp.val[0] = \
@@ -13822,22 +13872,22 @@ __LD4_LANE_FUNC(uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di,

 #undef __LD4_LANE_FUNC

-#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
-  FUNK intype vld4q_lane_##funcsuffix(const ptrtype *__ptr, intype __b, \
-                                      const int __c) { \
-    __builtin_aarch64_simd_xi __o; \
-    intype ret; \
-    __o = __builtin_aarch64_set_qregxiv4si(__o, (int32x4_t)__b.val[0], 0); \
-    __o = __builtin_aarch64_set_qregxiv4si(__o, (int32x4_t)__b.val[1], 1); \
-    __o = __builtin_aarch64_set_qregxiv4si(__o, (int32x4_t)__b.val[2], 2); \
-    __o = __builtin_aarch64_set_qregxiv4si(__o, (int32x4_t)__b.val[3], 3); \
-    __o = __builtin_aarch64_ld4_lane##mode( \
-        (__builtin_aarch64_simd_##ptrmode *)__ptr, __o, __c); \
-    ret.val[0] = (vtype)__builtin_aarch64_get_qregxiv4si(__o, 0); \
-    ret.val[1] = (vtype)__builtin_aarch64_get_qregxiv4si(__o, 1); \
-    ret.val[2] = (vtype)__builtin_aarch64_get_qregxiv4si(__o, 2); \
-    ret.val[3] = (vtype)__builtin_aarch64_get_qregxiv4si(__o, 3); \
-    return ret; \
+#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+  __funline intype vld4q_lane_##funcsuffix(const ptrtype *__ptr, intype __b, \
+                                           const int __c) { \
+    __builtin_aarch64_simd_xi __o; \
+    intype ret; \
+    __o = __builtin_aarch64_set_qregxiv4si(__o, (int32x4_t)__b.val[0], 0); \
+    __o = __builtin_aarch64_set_qregxiv4si(__o, (int32x4_t)__b.val[1], 1); \
+    __o = __builtin_aarch64_set_qregxiv4si(__o, (int32x4_t)__b.val[2], 2); \
+    __o = __builtin_aarch64_set_qregxiv4si(__o, (int32x4_t)__b.val[3], 3); \
+    __o = __builtin_aarch64_ld4_lane##mode( \
+        (__builtin_aarch64_simd_##ptrmode *)__ptr, __o, __c); \
+    ret.val[0] = (vtype)__builtin_aarch64_get_qregxiv4si(__o, 0); \
+    ret.val[1] = (vtype)__builtin_aarch64_get_qregxiv4si(__o, 1); \
+    ret.val[2] = (vtype)__builtin_aarch64_get_qregxiv4si(__o, 2); \
+    ret.val[3] = (vtype)__builtin_aarch64_get_qregxiv4si(__o, 3); \
+    return ret; \
   }

 __LD4_LANE_FUNC(float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16)
@@ -13857,2331 +13907,2369 @@ __LD4_LANE_FUNC(uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)

 #undef __LD4_LANE_FUNC

-FUNK float32x2_t vmax_f32(float32x2_t __a, float32x2_t __b) {
+__funline float32x2_t vmax_f32(float32x2_t __a, float32x2_t __b) {
   return __builtin_aarch64_smax_nanv2sf(__a, __b);
 }

-FUNK float64x1_t vmax_f64(float64x1_t __a, float64x1_t __b) {
+__funline float64x1_t vmax_f64(float64x1_t __a, float64x1_t __b) {
   return (float64x1_t){__builtin_aarch64_smax_nandf(vget_lane_f64(__a, 0),
                                                     vget_lane_f64(__b, 0))};
 }

-FUNK int8x8_t vmax_s8(int8x8_t __a, int8x8_t __b) {
+__funline int8x8_t vmax_s8(int8x8_t __a, int8x8_t __b) {
   return __builtin_aarch64_smaxv8qi(__a, __b);
 }

-FUNK int16x4_t vmax_s16(int16x4_t __a, int16x4_t __b) {
+__funline int16x4_t vmax_s16(int16x4_t __a, int16x4_t __b) {
   return __builtin_aarch64_smaxv4hi(__a, __b);
 }

-FUNK int32x2_t vmax_s32(int32x2_t __a, int32x2_t __b) {
+__funline
int32x2_t vmax_s32(int32x2_t __a, int32x2_t __b) { return __builtin_aarch64_smaxv2si(__a, __b); } -FUNK uint8x8_t vmax_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vmax_u8(uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_aarch64_umaxv8qi((int8x8_t)__a, (int8x8_t)__b); } -FUNK uint16x4_t vmax_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vmax_u16(uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_aarch64_umaxv4hi((int16x4_t)__a, (int16x4_t)__b); } -FUNK uint32x2_t vmax_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vmax_u32(uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_aarch64_umaxv2si((int32x2_t)__a, (int32x2_t)__b); } -FUNK float32x4_t vmaxq_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vmaxq_f32(float32x4_t __a, float32x4_t __b) { return __builtin_aarch64_smax_nanv4sf(__a, __b); } -FUNK float64x2_t vmaxq_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vmaxq_f64(float64x2_t __a, float64x2_t __b) { return __builtin_aarch64_smax_nanv2df(__a, __b); } -FUNK int8x16_t vmaxq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vmaxq_s8(int8x16_t __a, int8x16_t __b) { return __builtin_aarch64_smaxv16qi(__a, __b); } -FUNK int16x8_t vmaxq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vmaxq_s16(int16x8_t __a, int16x8_t __b) { return __builtin_aarch64_smaxv8hi(__a, __b); } -FUNK int32x4_t vmaxq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vmaxq_s32(int32x4_t __a, int32x4_t __b) { return __builtin_aarch64_smaxv4si(__a, __b); } -FUNK uint8x16_t vmaxq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vmaxq_u8(uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t)__builtin_aarch64_umaxv16qi((int8x16_t)__a, (int8x16_t)__b); } -FUNK uint16x8_t vmaxq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vmaxq_u16(uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t)__builtin_aarch64_umaxv8hi((int16x8_t)__a, (int16x8_t)__b); } -FUNK uint32x4_t vmaxq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vmaxq_u32(uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t)__builtin_aarch64_umaxv4si((int32x4_t)__a, (int32x4_t)__b); } -FUNK float32x2_t vmulx_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vmulx_f32(float32x2_t __a, float32x2_t __b) { return __builtin_aarch64_fmulxv2sf(__a, __b); } -FUNK float32x4_t vmulxq_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vmulxq_f32(float32x4_t __a, float32x4_t __b) { return __builtin_aarch64_fmulxv4sf(__a, __b); } -FUNK float64x1_t vmulx_f64(float64x1_t __a, float64x1_t __b) { +__funline float64x1_t vmulx_f64(float64x1_t __a, float64x1_t __b) { return (float64x1_t){__builtin_aarch64_fmulxdf(__a[0], __b[0])}; } -FUNK float64x2_t vmulxq_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vmulxq_f64(float64x2_t __a, float64x2_t __b) { return __builtin_aarch64_fmulxv2df(__a, __b); } -FUNK float32_t vmulxs_f32(float32_t __a, float32_t __b) { +__funline float32_t vmulxs_f32(float32_t __a, float32_t __b) { return __builtin_aarch64_fmulxsf(__a, __b); } -FUNK float64_t vmulxd_f64(float64_t __a, float64_t __b) { +__funline float64_t vmulxd_f64(float64_t __a, float64_t __b) { return __builtin_aarch64_fmulxdf(__a, __b); } -FUNK float32x2_t vmulx_lane_f32(float32x2_t __a, float32x2_t __v, - const int __lane) { +__funline float32x2_t vmulx_lane_f32(float32x2_t __a, float32x2_t __v, + const int __lane) { return vmulx_f32(__a, __aarch64_vdup_lane_f32(__v, __lane)); } -FUNK float64x1_t 
vmulx_lane_f64(float64x1_t __a, float64x1_t __v, - const int __lane) { +__funline float64x1_t vmulx_lane_f64(float64x1_t __a, float64x1_t __v, + const int __lane) { return vmulx_f64(__a, __aarch64_vdup_lane_f64(__v, __lane)); } -FUNK float32x4_t vmulxq_lane_f32(float32x4_t __a, float32x2_t __v, - const int __lane) { +__funline float32x4_t vmulxq_lane_f32(float32x4_t __a, float32x2_t __v, + const int __lane) { return vmulxq_f32(__a, __aarch64_vdupq_lane_f32(__v, __lane)); } -FUNK float64x2_t vmulxq_lane_f64(float64x2_t __a, float64x1_t __v, - const int __lane) { +__funline float64x2_t vmulxq_lane_f64(float64x2_t __a, float64x1_t __v, + const int __lane) { return vmulxq_f64(__a, __aarch64_vdupq_lane_f64(__v, __lane)); } -FUNK float32x2_t vmulx_laneq_f32(float32x2_t __a, float32x4_t __v, - const int __lane) { +__funline float32x2_t vmulx_laneq_f32(float32x2_t __a, float32x4_t __v, + const int __lane) { return vmulx_f32(__a, __aarch64_vdup_laneq_f32(__v, __lane)); } -FUNK float64x1_t vmulx_laneq_f64(float64x1_t __a, float64x2_t __v, - const int __lane) { +__funline float64x1_t vmulx_laneq_f64(float64x1_t __a, float64x2_t __v, + const int __lane) { return vmulx_f64(__a, __aarch64_vdup_laneq_f64(__v, __lane)); } -FUNK float32x4_t vmulxq_laneq_f32(float32x4_t __a, float32x4_t __v, - const int __lane) { +__funline float32x4_t vmulxq_laneq_f32(float32x4_t __a, float32x4_t __v, + const int __lane) { return vmulxq_f32(__a, __aarch64_vdupq_laneq_f32(__v, __lane)); } -FUNK float64x2_t vmulxq_laneq_f64(float64x2_t __a, float64x2_t __v, - const int __lane) { +__funline float64x2_t vmulxq_laneq_f64(float64x2_t __a, float64x2_t __v, + const int __lane) { return vmulxq_f64(__a, __aarch64_vdupq_laneq_f64(__v, __lane)); } -FUNK float32_t vmulxs_lane_f32(float32_t __a, float32x2_t __v, - const int __lane) { +__funline float32_t vmulxs_lane_f32(float32_t __a, float32x2_t __v, + const int __lane) { return vmulxs_f32(__a, __aarch64_vget_lane_any(__v, __lane)); } -FUNK float32_t vmulxs_laneq_f32(float32_t __a, float32x4_t __v, - const int __lane) { +__funline float32_t vmulxs_laneq_f32(float32_t __a, float32x4_t __v, + const int __lane) { return vmulxs_f32(__a, __aarch64_vget_lane_any(__v, __lane)); } -FUNK float64_t vmulxd_lane_f64(float64_t __a, float64x1_t __v, - const int __lane) { +__funline float64_t vmulxd_lane_f64(float64_t __a, float64x1_t __v, + const int __lane) { return vmulxd_f64(__a, __aarch64_vget_lane_any(__v, __lane)); } -FUNK float64_t vmulxd_laneq_f64(float64_t __a, float64x2_t __v, - const int __lane) { +__funline float64_t vmulxd_laneq_f64(float64_t __a, float64x2_t __v, + const int __lane) { return vmulxd_f64(__a, __aarch64_vget_lane_any(__v, __lane)); } -FUNK int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) { +__funline int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) { return __builtin_aarch64_smaxpv8qi(a, b); } -FUNK int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) { +__funline int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) { return __builtin_aarch64_smaxpv4hi(a, b); } -FUNK int32x2_t vpmax_s32(int32x2_t a, int32x2_t b) { +__funline int32x2_t vpmax_s32(int32x2_t a, int32x2_t b) { return __builtin_aarch64_smaxpv2si(a, b); } -FUNK uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) { +__funline uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t)__builtin_aarch64_umaxpv8qi((int8x8_t)a, (int8x8_t)b); } -FUNK uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) { +__funline uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t)__builtin_aarch64_umaxpv4hi((int16x4_t)a, (int16x4_t)b); } -FUNK 
uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b) { +__funline uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t)__builtin_aarch64_umaxpv2si((int32x2_t)a, (int32x2_t)b); } -FUNK int8x16_t vpmaxq_s8(int8x16_t a, int8x16_t b) { +__funline int8x16_t vpmaxq_s8(int8x16_t a, int8x16_t b) { return __builtin_aarch64_smaxpv16qi(a, b); } -FUNK int16x8_t vpmaxq_s16(int16x8_t a, int16x8_t b) { +__funline int16x8_t vpmaxq_s16(int16x8_t a, int16x8_t b) { return __builtin_aarch64_smaxpv8hi(a, b); } -FUNK int32x4_t vpmaxq_s32(int32x4_t a, int32x4_t b) { +__funline int32x4_t vpmaxq_s32(int32x4_t a, int32x4_t b) { return __builtin_aarch64_smaxpv4si(a, b); } -FUNK uint8x16_t vpmaxq_u8(uint8x16_t a, uint8x16_t b) { +__funline uint8x16_t vpmaxq_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16_t)__builtin_aarch64_umaxpv16qi((int8x16_t)a, (int8x16_t)b); } -FUNK uint16x8_t vpmaxq_u16(uint16x8_t a, uint16x8_t b) { +__funline uint16x8_t vpmaxq_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8_t)__builtin_aarch64_umaxpv8hi((int16x8_t)a, (int16x8_t)b); } -FUNK uint32x4_t vpmaxq_u32(uint32x4_t a, uint32x4_t b) { +__funline uint32x4_t vpmaxq_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4_t)__builtin_aarch64_umaxpv4si((int32x4_t)a, (int32x4_t)b); } -FUNK float32x2_t vpmax_f32(float32x2_t a, float32x2_t b) { +__funline float32x2_t vpmax_f32(float32x2_t a, float32x2_t b) { return __builtin_aarch64_smax_nanpv2sf(a, b); } -FUNK float32x4_t vpmaxq_f32(float32x4_t a, float32x4_t b) { +__funline float32x4_t vpmaxq_f32(float32x4_t a, float32x4_t b) { return __builtin_aarch64_smax_nanpv4sf(a, b); } -FUNK float64x2_t vpmaxq_f64(float64x2_t a, float64x2_t b) { +__funline float64x2_t vpmaxq_f64(float64x2_t a, float64x2_t b) { return __builtin_aarch64_smax_nanpv2df(a, b); } -FUNK float64_t vpmaxqd_f64(float64x2_t a) { +__funline float64_t vpmaxqd_f64(float64x2_t a) { return __builtin_aarch64_reduc_smax_nan_scal_v2df(a); } -FUNK float32_t vpmaxs_f32(float32x2_t a) { +__funline float32_t vpmaxs_f32(float32x2_t a) { return __builtin_aarch64_reduc_smax_nan_scal_v2sf(a); } -FUNK float32x2_t vpmaxnm_f32(float32x2_t a, float32x2_t b) { +__funline float32x2_t vpmaxnm_f32(float32x2_t a, float32x2_t b) { return __builtin_aarch64_smaxpv2sf(a, b); } -FUNK float32x4_t vpmaxnmq_f32(float32x4_t a, float32x4_t b) { +__funline float32x4_t vpmaxnmq_f32(float32x4_t a, float32x4_t b) { return __builtin_aarch64_smaxpv4sf(a, b); } -FUNK float64x2_t vpmaxnmq_f64(float64x2_t a, float64x2_t b) { +__funline float64x2_t vpmaxnmq_f64(float64x2_t a, float64x2_t b) { return __builtin_aarch64_smaxpv2df(a, b); } -FUNK float64_t vpmaxnmqd_f64(float64x2_t a) { +__funline float64_t vpmaxnmqd_f64(float64x2_t a) { return __builtin_aarch64_reduc_smax_scal_v2df(a); } -FUNK float32_t vpmaxnms_f32(float32x2_t a) { +__funline float32_t vpmaxnms_f32(float32x2_t a) { return __builtin_aarch64_reduc_smax_scal_v2sf(a); } -FUNK int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) { +__funline int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) { return __builtin_aarch64_sminpv8qi(a, b); } -FUNK int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) { +__funline int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) { return __builtin_aarch64_sminpv4hi(a, b); } -FUNK int32x2_t vpmin_s32(int32x2_t a, int32x2_t b) { +__funline int32x2_t vpmin_s32(int32x2_t a, int32x2_t b) { return __builtin_aarch64_sminpv2si(a, b); } -FUNK uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) { +__funline uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t)__builtin_aarch64_uminpv8qi((int8x8_t)a, 
(int8x8_t)b); } -FUNK uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) { +__funline uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t)__builtin_aarch64_uminpv4hi((int16x4_t)a, (int16x4_t)b); } -FUNK uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b) { +__funline uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t)__builtin_aarch64_uminpv2si((int32x2_t)a, (int32x2_t)b); } -FUNK int8x16_t vpminq_s8(int8x16_t a, int8x16_t b) { +__funline int8x16_t vpminq_s8(int8x16_t a, int8x16_t b) { return __builtin_aarch64_sminpv16qi(a, b); } -FUNK int16x8_t vpminq_s16(int16x8_t a, int16x8_t b) { +__funline int16x8_t vpminq_s16(int16x8_t a, int16x8_t b) { return __builtin_aarch64_sminpv8hi(a, b); } -FUNK int32x4_t vpminq_s32(int32x4_t a, int32x4_t b) { +__funline int32x4_t vpminq_s32(int32x4_t a, int32x4_t b) { return __builtin_aarch64_sminpv4si(a, b); } -FUNK uint8x16_t vpminq_u8(uint8x16_t a, uint8x16_t b) { +__funline uint8x16_t vpminq_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16_t)__builtin_aarch64_uminpv16qi((int8x16_t)a, (int8x16_t)b); } -FUNK uint16x8_t vpminq_u16(uint16x8_t a, uint16x8_t b) { +__funline uint16x8_t vpminq_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8_t)__builtin_aarch64_uminpv8hi((int16x8_t)a, (int16x8_t)b); } -FUNK uint32x4_t vpminq_u32(uint32x4_t a, uint32x4_t b) { +__funline uint32x4_t vpminq_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4_t)__builtin_aarch64_uminpv4si((int32x4_t)a, (int32x4_t)b); } -FUNK float32x2_t vpmin_f32(float32x2_t a, float32x2_t b) { +__funline float32x2_t vpmin_f32(float32x2_t a, float32x2_t b) { return __builtin_aarch64_smin_nanpv2sf(a, b); } -FUNK float32x4_t vpminq_f32(float32x4_t a, float32x4_t b) { +__funline float32x4_t vpminq_f32(float32x4_t a, float32x4_t b) { return __builtin_aarch64_smin_nanpv4sf(a, b); } -FUNK float64x2_t vpminq_f64(float64x2_t a, float64x2_t b) { +__funline float64x2_t vpminq_f64(float64x2_t a, float64x2_t b) { return __builtin_aarch64_smin_nanpv2df(a, b); } -FUNK float64_t vpminqd_f64(float64x2_t a) { +__funline float64_t vpminqd_f64(float64x2_t a) { return __builtin_aarch64_reduc_smin_nan_scal_v2df(a); } -FUNK float32_t vpmins_f32(float32x2_t a) { +__funline float32_t vpmins_f32(float32x2_t a) { return __builtin_aarch64_reduc_smin_nan_scal_v2sf(a); } -FUNK float32x2_t vpminnm_f32(float32x2_t a, float32x2_t b) { +__funline float32x2_t vpminnm_f32(float32x2_t a, float32x2_t b) { return __builtin_aarch64_sminpv2sf(a, b); } -FUNK float32x4_t vpminnmq_f32(float32x4_t a, float32x4_t b) { +__funline float32x4_t vpminnmq_f32(float32x4_t a, float32x4_t b) { return __builtin_aarch64_sminpv4sf(a, b); } -FUNK float64x2_t vpminnmq_f64(float64x2_t a, float64x2_t b) { +__funline float64x2_t vpminnmq_f64(float64x2_t a, float64x2_t b) { return __builtin_aarch64_sminpv2df(a, b); } -FUNK float64_t vpminnmqd_f64(float64x2_t a) { +__funline float64_t vpminnmqd_f64(float64x2_t a) { return __builtin_aarch64_reduc_smin_scal_v2df(a); } -FUNK float32_t vpminnms_f32(float32x2_t a) { +__funline float32_t vpminnms_f32(float32x2_t a) { return __builtin_aarch64_reduc_smin_scal_v2sf(a); } -FUNK float32x2_t vmaxnm_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vmaxnm_f32(float32x2_t __a, float32x2_t __b) { return __builtin_aarch64_fmaxv2sf(__a, __b); } -FUNK float64x1_t vmaxnm_f64(float64x1_t __a, float64x1_t __b) { +__funline float64x1_t vmaxnm_f64(float64x1_t __a, float64x1_t __b) { return (float64x1_t){ __builtin_aarch64_fmaxdf(vget_lane_f64(__a, 0), vget_lane_f64(__b, 0))}; } -FUNK 
float32x4_t vmaxnmq_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vmaxnmq_f32(float32x4_t __a, float32x4_t __b) { return __builtin_aarch64_fmaxv4sf(__a, __b); } -FUNK float64x2_t vmaxnmq_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vmaxnmq_f64(float64x2_t __a, float64x2_t __b) { return __builtin_aarch64_fmaxv2df(__a, __b); } -FUNK float32_t vmaxv_f32(float32x2_t __a) { +__funline float32_t vmaxv_f32(float32x2_t __a) { return __builtin_aarch64_reduc_smax_nan_scal_v2sf(__a); } -FUNK int8_t vmaxv_s8(int8x8_t __a) { +__funline int8_t vmaxv_s8(int8x8_t __a) { return __builtin_aarch64_reduc_smax_scal_v8qi(__a); } -FUNK int16_t vmaxv_s16(int16x4_t __a) { +__funline int16_t vmaxv_s16(int16x4_t __a) { return __builtin_aarch64_reduc_smax_scal_v4hi(__a); } -FUNK int32_t vmaxv_s32(int32x2_t __a) { +__funline int32_t vmaxv_s32(int32x2_t __a) { return __builtin_aarch64_reduc_smax_scal_v2si(__a); } -FUNK uint8_t vmaxv_u8(uint8x8_t __a) { +__funline uint8_t vmaxv_u8(uint8x8_t __a) { return __builtin_aarch64_reduc_umax_scal_v8qi_uu(__a); } -FUNK uint16_t vmaxv_u16(uint16x4_t __a) { +__funline uint16_t vmaxv_u16(uint16x4_t __a) { return __builtin_aarch64_reduc_umax_scal_v4hi_uu(__a); } -FUNK uint32_t vmaxv_u32(uint32x2_t __a) { +__funline uint32_t vmaxv_u32(uint32x2_t __a) { return __builtin_aarch64_reduc_umax_scal_v2si_uu(__a); } -FUNK float32_t vmaxvq_f32(float32x4_t __a) { +__funline float32_t vmaxvq_f32(float32x4_t __a) { return __builtin_aarch64_reduc_smax_nan_scal_v4sf(__a); } -FUNK float64_t vmaxvq_f64(float64x2_t __a) { +__funline float64_t vmaxvq_f64(float64x2_t __a) { return __builtin_aarch64_reduc_smax_nan_scal_v2df(__a); } -FUNK int8_t vmaxvq_s8(int8x16_t __a) { +__funline int8_t vmaxvq_s8(int8x16_t __a) { return __builtin_aarch64_reduc_smax_scal_v16qi(__a); } -FUNK int16_t vmaxvq_s16(int16x8_t __a) { +__funline int16_t vmaxvq_s16(int16x8_t __a) { return __builtin_aarch64_reduc_smax_scal_v8hi(__a); } -FUNK int32_t vmaxvq_s32(int32x4_t __a) { +__funline int32_t vmaxvq_s32(int32x4_t __a) { return __builtin_aarch64_reduc_smax_scal_v4si(__a); } -FUNK uint8_t vmaxvq_u8(uint8x16_t __a) { +__funline uint8_t vmaxvq_u8(uint8x16_t __a) { return __builtin_aarch64_reduc_umax_scal_v16qi_uu(__a); } -FUNK uint16_t vmaxvq_u16(uint16x8_t __a) { +__funline uint16_t vmaxvq_u16(uint16x8_t __a) { return __builtin_aarch64_reduc_umax_scal_v8hi_uu(__a); } -FUNK uint32_t vmaxvq_u32(uint32x4_t __a) { +__funline uint32_t vmaxvq_u32(uint32x4_t __a) { return __builtin_aarch64_reduc_umax_scal_v4si_uu(__a); } -FUNK float32_t vmaxnmv_f32(float32x2_t __a) { +__funline float32_t vmaxnmv_f32(float32x2_t __a) { return __builtin_aarch64_reduc_smax_scal_v2sf(__a); } -FUNK float32_t vmaxnmvq_f32(float32x4_t __a) { +__funline float32_t vmaxnmvq_f32(float32x4_t __a) { return __builtin_aarch64_reduc_smax_scal_v4sf(__a); } -FUNK float64_t vmaxnmvq_f64(float64x2_t __a) { +__funline float64_t vmaxnmvq_f64(float64x2_t __a) { return __builtin_aarch64_reduc_smax_scal_v2df(__a); } -FUNK float32x2_t vmin_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vmin_f32(float32x2_t __a, float32x2_t __b) { return __builtin_aarch64_smin_nanv2sf(__a, __b); } -FUNK float64x1_t vmin_f64(float64x1_t __a, float64x1_t __b) { +__funline float64x1_t vmin_f64(float64x1_t __a, float64x1_t __b) { return (float64x1_t){__builtin_aarch64_smin_nandf(vget_lane_f64(__a, 0), vget_lane_f64(__b, 0))}; } -FUNK int8x8_t vmin_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vmin_s8(int8x8_t __a, int8x8_t __b) { return 
__builtin_aarch64_sminv8qi(__a, __b); } -FUNK int16x4_t vmin_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vmin_s16(int16x4_t __a, int16x4_t __b) { return __builtin_aarch64_sminv4hi(__a, __b); } -FUNK int32x2_t vmin_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vmin_s32(int32x2_t __a, int32x2_t __b) { return __builtin_aarch64_sminv2si(__a, __b); } -FUNK uint8x8_t vmin_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vmin_u8(uint8x8_t __a, uint8x8_t __b) { return (uint8x8_t)__builtin_aarch64_uminv8qi((int8x8_t)__a, (int8x8_t)__b); } -FUNK uint16x4_t vmin_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vmin_u16(uint16x4_t __a, uint16x4_t __b) { return (uint16x4_t)__builtin_aarch64_uminv4hi((int16x4_t)__a, (int16x4_t)__b); } -FUNK uint32x2_t vmin_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vmin_u32(uint32x2_t __a, uint32x2_t __b) { return (uint32x2_t)__builtin_aarch64_uminv2si((int32x2_t)__a, (int32x2_t)__b); } -FUNK float32x4_t vminq_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vminq_f32(float32x4_t __a, float32x4_t __b) { return __builtin_aarch64_smin_nanv4sf(__a, __b); } -FUNK float64x2_t vminq_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vminq_f64(float64x2_t __a, float64x2_t __b) { return __builtin_aarch64_smin_nanv2df(__a, __b); } -FUNK int8x16_t vminq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vminq_s8(int8x16_t __a, int8x16_t __b) { return __builtin_aarch64_sminv16qi(__a, __b); } -FUNK int16x8_t vminq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vminq_s16(int16x8_t __a, int16x8_t __b) { return __builtin_aarch64_sminv8hi(__a, __b); } -FUNK int32x4_t vminq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vminq_s32(int32x4_t __a, int32x4_t __b) { return __builtin_aarch64_sminv4si(__a, __b); } -FUNK uint8x16_t vminq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vminq_u8(uint8x16_t __a, uint8x16_t __b) { return (uint8x16_t)__builtin_aarch64_uminv16qi((int8x16_t)__a, (int8x16_t)__b); } -FUNK uint16x8_t vminq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vminq_u16(uint16x8_t __a, uint16x8_t __b) { return (uint16x8_t)__builtin_aarch64_uminv8hi((int16x8_t)__a, (int16x8_t)__b); } -FUNK uint32x4_t vminq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vminq_u32(uint32x4_t __a, uint32x4_t __b) { return (uint32x4_t)__builtin_aarch64_uminv4si((int32x4_t)__a, (int32x4_t)__b); } -FUNK float32x2_t vminnm_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vminnm_f32(float32x2_t __a, float32x2_t __b) { return __builtin_aarch64_fminv2sf(__a, __b); } -FUNK float64x1_t vminnm_f64(float64x1_t __a, float64x1_t __b) { +__funline float64x1_t vminnm_f64(float64x1_t __a, float64x1_t __b) { return (float64x1_t){ __builtin_aarch64_fmindf(vget_lane_f64(__a, 0), vget_lane_f64(__b, 0))}; } -FUNK float32x4_t vminnmq_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vminnmq_f32(float32x4_t __a, float32x4_t __b) { return __builtin_aarch64_fminv4sf(__a, __b); } -FUNK float64x2_t vminnmq_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vminnmq_f64(float64x2_t __a, float64x2_t __b) { return __builtin_aarch64_fminv2df(__a, __b); } -FUNK float32_t vminv_f32(float32x2_t __a) { +__funline float32_t vminv_f32(float32x2_t __a) { return __builtin_aarch64_reduc_smin_nan_scal_v2sf(__a); } -FUNK int8_t vminv_s8(int8x8_t __a) { +__funline int8_t vminv_s8(int8x8_t __a) { return __builtin_aarch64_reduc_smin_scal_v8qi(__a); } -FUNK 
int16_t vminv_s16(int16x4_t __a) { +__funline int16_t vminv_s16(int16x4_t __a) { return __builtin_aarch64_reduc_smin_scal_v4hi(__a); } -FUNK int32_t vminv_s32(int32x2_t __a) { +__funline int32_t vminv_s32(int32x2_t __a) { return __builtin_aarch64_reduc_smin_scal_v2si(__a); } -FUNK uint8_t vminv_u8(uint8x8_t __a) { +__funline uint8_t vminv_u8(uint8x8_t __a) { return __builtin_aarch64_reduc_umin_scal_v8qi_uu(__a); } -FUNK uint16_t vminv_u16(uint16x4_t __a) { +__funline uint16_t vminv_u16(uint16x4_t __a) { return __builtin_aarch64_reduc_umin_scal_v4hi_uu(__a); } -FUNK uint32_t vminv_u32(uint32x2_t __a) { +__funline uint32_t vminv_u32(uint32x2_t __a) { return __builtin_aarch64_reduc_umin_scal_v2si_uu(__a); } -FUNK float32_t vminvq_f32(float32x4_t __a) { +__funline float32_t vminvq_f32(float32x4_t __a) { return __builtin_aarch64_reduc_smin_nan_scal_v4sf(__a); } -FUNK float64_t vminvq_f64(float64x2_t __a) { +__funline float64_t vminvq_f64(float64x2_t __a) { return __builtin_aarch64_reduc_smin_nan_scal_v2df(__a); } -FUNK int8_t vminvq_s8(int8x16_t __a) { +__funline int8_t vminvq_s8(int8x16_t __a) { return __builtin_aarch64_reduc_smin_scal_v16qi(__a); } -FUNK int16_t vminvq_s16(int16x8_t __a) { +__funline int16_t vminvq_s16(int16x8_t __a) { return __builtin_aarch64_reduc_smin_scal_v8hi(__a); } -FUNK int32_t vminvq_s32(int32x4_t __a) { +__funline int32_t vminvq_s32(int32x4_t __a) { return __builtin_aarch64_reduc_smin_scal_v4si(__a); } -FUNK uint8_t vminvq_u8(uint8x16_t __a) { +__funline uint8_t vminvq_u8(uint8x16_t __a) { return __builtin_aarch64_reduc_umin_scal_v16qi_uu(__a); } -FUNK uint16_t vminvq_u16(uint16x8_t __a) { +__funline uint16_t vminvq_u16(uint16x8_t __a) { return __builtin_aarch64_reduc_umin_scal_v8hi_uu(__a); } -FUNK uint32_t vminvq_u32(uint32x4_t __a) { +__funline uint32_t vminvq_u32(uint32x4_t __a) { return __builtin_aarch64_reduc_umin_scal_v4si_uu(__a); } -FUNK float32_t vminnmv_f32(float32x2_t __a) { +__funline float32_t vminnmv_f32(float32x2_t __a) { return __builtin_aarch64_reduc_smin_scal_v2sf(__a); } -FUNK float32_t vminnmvq_f32(float32x4_t __a) { +__funline float32_t vminnmvq_f32(float32x4_t __a) { return __builtin_aarch64_reduc_smin_scal_v4sf(__a); } -FUNK float64_t vminnmvq_f64(float64x2_t __a) { +__funline float64_t vminnmvq_f64(float64x2_t __a) { return __builtin_aarch64_reduc_smin_scal_v2df(__a); } -FUNK float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) { +__funline float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return a + b * c; } -FUNK float64x1_t vmla_f64(float64x1_t __a, float64x1_t __b, float64x1_t __c) { +__funline float64x1_t vmla_f64(float64x1_t __a, float64x1_t __b, + float64x1_t __c) { return __a + __b * __c; } -FUNK float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { +__funline float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return a + b * c; } -FUNK float64x2_t vmlaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) { +__funline float64x2_t vmlaq_f64(float64x2_t a, float64x2_t b, float64x2_t c) { return a + b * c; } -FUNK float32x2_t vmla_lane_f32(float32x2_t __a, float32x2_t __b, - float32x2_t __c, const int __lane) { +__funline float32x2_t vmla_lane_f32(float32x2_t __a, float32x2_t __b, + float32x2_t __c, const int __lane) { return (__a + (__b * __aarch64_vget_lane_any(__c, __lane))); } -FUNK int16x4_t vmla_lane_s16(int16x4_t __a, int16x4_t __b, int16x4_t __c, - const int __lane) { +__funline int16x4_t vmla_lane_s16(int16x4_t __a, int16x4_t __b, int16x4_t __c, + const int __lane) { 
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK int32x2_t vmla_lane_s32(int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __lane) {
+__funline int32x2_t vmla_lane_s32(int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint16x4_t vmla_lane_u16(uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __lane) {
+__funline uint16x4_t vmla_lane_u16(uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint32x2_t vmla_lane_u32(uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __lane) {
+__funline uint32x2_t vmla_lane_u32(uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK float32x2_t vmla_laneq_f32(float32x2_t __a, float32x2_t __b, float32x4_t __c, const int __lane) {
+__funline float32x2_t vmla_laneq_f32(float32x2_t __a, float32x2_t __b, float32x4_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK int16x4_t vmla_laneq_s16(int16x4_t __a, int16x4_t __b, int16x8_t __c, const int __lane) {
+__funline int16x4_t vmla_laneq_s16(int16x4_t __a, int16x4_t __b, int16x8_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK int32x2_t vmla_laneq_s32(int32x2_t __a, int32x2_t __b, int32x4_t __c, const int __lane) {
+__funline int32x2_t vmla_laneq_s32(int32x2_t __a, int32x2_t __b, int32x4_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint16x4_t vmla_laneq_u16(uint16x4_t __a, uint16x4_t __b, uint16x8_t __c, const int __lane) {
+__funline uint16x4_t vmla_laneq_u16(uint16x4_t __a, uint16x4_t __b, uint16x8_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint32x2_t vmla_laneq_u32(uint32x2_t __a, uint32x2_t __b, uint32x4_t __c, const int __lane) {
+__funline uint32x2_t vmla_laneq_u32(uint32x2_t __a, uint32x2_t __b, uint32x4_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK float32x4_t vmlaq_lane_f32(float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __lane) {
+__funline float32x4_t vmlaq_lane_f32(float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK int16x8_t vmlaq_lane_s16(int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __lane) {
+__funline int16x8_t vmlaq_lane_s16(int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK int32x4_t vmlaq_lane_s32(int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __lane) {
+__funline int32x4_t vmlaq_lane_s32(int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint16x8_t vmlaq_lane_u16(uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __lane) {
+__funline uint16x8_t vmlaq_lane_u16(uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint32x4_t vmlaq_lane_u32(uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __lane) {
+__funline uint32x4_t vmlaq_lane_u32(uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK float32x4_t vmlaq_laneq_f32(float32x4_t __a, float32x4_t __b, float32x4_t __c, const int __lane) {
+__funline float32x4_t vmlaq_laneq_f32(float32x4_t __a, float32x4_t __b, float32x4_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK int16x8_t vmlaq_laneq_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c, const int __lane) {
+__funline int16x8_t vmlaq_laneq_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK int32x4_t vmlaq_laneq_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c, const int __lane) {
+__funline int32x4_t vmlaq_laneq_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint16x8_t vmlaq_laneq_u16(uint16x8_t __a, uint16x8_t __b, uint16x8_t __c, const int __lane) {
+__funline uint16x8_t vmlaq_laneq_u16(uint16x8_t __a, uint16x8_t __b, uint16x8_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint32x4_t vmlaq_laneq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c, const int __lane) {
+__funline uint32x4_t vmlaq_laneq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c, const int __lane) {
   return (__a + (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
+__funline float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return a - b * c;
 }
-FUNK float64x1_t vmls_f64(float64x1_t __a, float64x1_t __b, float64x1_t __c) {
+__funline float64x1_t vmls_f64(float64x1_t __a, float64x1_t __b, float64x1_t __c) {
   return __a - __b * __c;
 }
-FUNK float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
+__funline float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
   return a - b * c;
 }
-FUNK float64x2_t vmlsq_f64(float64x2_t a, float64x2_t b, float64x2_t c) {
+__funline float64x2_t vmlsq_f64(float64x2_t a, float64x2_t b, float64x2_t c) {
   return a - b * c;
 }
-FUNK float32x2_t vmls_lane_f32(float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __lane) {
+__funline float32x2_t vmls_lane_f32(float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK int16x4_t vmls_lane_s16(int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __lane) {
+__funline int16x4_t vmls_lane_s16(int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK int32x2_t vmls_lane_s32(int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __lane) {
+__funline int32x2_t vmls_lane_s32(int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint16x4_t vmls_lane_u16(uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __lane) {
+__funline uint16x4_t vmls_lane_u16(uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint32x2_t vmls_lane_u32(uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __lane) {
+__funline uint32x2_t vmls_lane_u32(uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK float32x2_t vmls_laneq_f32(float32x2_t __a, float32x2_t __b, float32x4_t __c, const int __lane) {
+__funline float32x2_t vmls_laneq_f32(float32x2_t __a, float32x2_t __b, float32x4_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK int16x4_t vmls_laneq_s16(int16x4_t __a, int16x4_t __b, int16x8_t __c, const int __lane) {
+__funline int16x4_t vmls_laneq_s16(int16x4_t __a, int16x4_t __b, int16x8_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK int32x2_t vmls_laneq_s32(int32x2_t __a, int32x2_t __b, int32x4_t __c, const int __lane) {
+__funline int32x2_t vmls_laneq_s32(int32x2_t __a, int32x2_t __b, int32x4_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint16x4_t vmls_laneq_u16(uint16x4_t __a, uint16x4_t __b, uint16x8_t __c, const int __lane) {
+__funline uint16x4_t vmls_laneq_u16(uint16x4_t __a, uint16x4_t __b, uint16x8_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint32x2_t vmls_laneq_u32(uint32x2_t __a, uint32x2_t __b, uint32x4_t __c, const int __lane) {
+__funline uint32x2_t vmls_laneq_u32(uint32x2_t __a, uint32x2_t __b, uint32x4_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK float32x4_t vmlsq_lane_f32(float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __lane) {
+__funline float32x4_t vmlsq_lane_f32(float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK int16x8_t vmlsq_lane_s16(int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __lane) {
+__funline int16x8_t vmlsq_lane_s16(int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK int32x4_t vmlsq_lane_s32(int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __lane) {
+__funline int32x4_t vmlsq_lane_s32(int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint16x8_t vmlsq_lane_u16(uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __lane) {
+__funline uint16x8_t vmlsq_lane_u16(uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint32x4_t vmlsq_lane_u32(uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __lane) {
+__funline uint32x4_t vmlsq_lane_u32(uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK float32x4_t vmlsq_laneq_f32(float32x4_t __a, float32x4_t __b, float32x4_t __c, const int __lane) {
+__funline float32x4_t vmlsq_laneq_f32(float32x4_t __a, float32x4_t __b, float32x4_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK int16x8_t vmlsq_laneq_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c, const int __lane) {
+__funline int16x8_t vmlsq_laneq_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK int32x4_t vmlsq_laneq_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c, const int __lane) {
+__funline int32x4_t vmlsq_laneq_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint16x8_t vmlsq_laneq_u16(uint16x8_t __a, uint16x8_t __b, uint16x8_t __c, const int __lane) {
+__funline uint16x8_t vmlsq_laneq_u16(uint16x8_t __a, uint16x8_t __b, uint16x8_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK uint32x4_t vmlsq_laneq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c, const int __lane) {
+__funline uint32x4_t vmlsq_laneq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c, const int __lane) {
   return (__a - (__b * __aarch64_vget_lane_any(__c, __lane)));
 }
-FUNK float16x4_t vmov_n_f16(float16_t __a) {
+__funline float16x4_t vmov_n_f16(float16_t __a) {
   return vdup_n_f16(__a);
 }
-FUNK float32x2_t vmov_n_f32(float32_t __a) {
+__funline float32x2_t vmov_n_f32(float32_t __a) {
   return vdup_n_f32(__a);
 }
-FUNK float64x1_t vmov_n_f64(float64_t __a) {
+__funline float64x1_t vmov_n_f64(float64_t __a) {
   return (float64x1_t){__a};
 }
-FUNK poly8x8_t vmov_n_p8(poly8_t __a) {
+__funline poly8x8_t vmov_n_p8(poly8_t __a) {
   return vdup_n_p8(__a);
 }
-FUNK poly16x4_t vmov_n_p16(poly16_t __a) {
+__funline poly16x4_t vmov_n_p16(poly16_t __a) {
   return vdup_n_p16(__a);
 }
-FUNK poly64x1_t vmov_n_p64(poly64_t __a) {
+__funline poly64x1_t vmov_n_p64(poly64_t __a) {
   return vdup_n_p64(__a);
 }
-FUNK int8x8_t vmov_n_s8(int8_t __a) {
+__funline int8x8_t vmov_n_s8(int8_t __a) {
   return vdup_n_s8(__a);
 }
-FUNK int16x4_t vmov_n_s16(int16_t __a) {
+__funline int16x4_t vmov_n_s16(int16_t __a) {
   return vdup_n_s16(__a);
 }
-FUNK int32x2_t vmov_n_s32(int32_t __a) {
+__funline int32x2_t vmov_n_s32(int32_t __a) {
   return vdup_n_s32(__a);
 }
-FUNK int64x1_t vmov_n_s64(int64_t __a) {
+__funline int64x1_t vmov_n_s64(int64_t __a) {
   return (int64x1_t){__a};
 }
-FUNK uint8x8_t vmov_n_u8(uint8_t __a) {
+__funline uint8x8_t vmov_n_u8(uint8_t __a) {
   return vdup_n_u8(__a);
 }
-FUNK uint16x4_t vmov_n_u16(uint16_t __a) {
+__funline uint16x4_t vmov_n_u16(uint16_t __a) {
   return vdup_n_u16(__a);
 }
-FUNK uint32x2_t vmov_n_u32(uint32_t __a) {
+__funline uint32x2_t vmov_n_u32(uint32_t __a) {
   return vdup_n_u32(__a);
 }
-FUNK uint64x1_t vmov_n_u64(uint64_t __a) {
+__funline uint64x1_t vmov_n_u64(uint64_t __a) {
   return (uint64x1_t){__a};
 }
-FUNK float16x8_t vmovq_n_f16(float16_t __a) {
+__funline float16x8_t vmovq_n_f16(float16_t __a) {
   return vdupq_n_f16(__a);
 }
-FUNK float32x4_t vmovq_n_f32(float32_t __a) {
+__funline float32x4_t vmovq_n_f32(float32_t __a) {
   return vdupq_n_f32(__a);
 }
-FUNK float64x2_t vmovq_n_f64(float64_t __a) {
+__funline float64x2_t vmovq_n_f64(float64_t __a) {
   return vdupq_n_f64(__a);
 }
-FUNK poly8x16_t vmovq_n_p8(poly8_t __a) {
+__funline poly8x16_t vmovq_n_p8(poly8_t __a) {
   return vdupq_n_p8(__a);
 }
-FUNK poly16x8_t vmovq_n_p16(poly16_t __a) {
+__funline poly16x8_t vmovq_n_p16(poly16_t __a) {
   return vdupq_n_p16(__a);
 }
-FUNK poly64x2_t vmovq_n_p64(poly64_t __a) {
+__funline poly64x2_t vmovq_n_p64(poly64_t __a) {
   return vdupq_n_p64(__a);
 }
-FUNK int8x16_t vmovq_n_s8(int8_t __a) {
+__funline int8x16_t vmovq_n_s8(int8_t __a) {
   return vdupq_n_s8(__a);
 }
-FUNK int16x8_t vmovq_n_s16(int16_t __a) {
+__funline int16x8_t vmovq_n_s16(int16_t __a) {
   return vdupq_n_s16(__a);
 }
-FUNK int32x4_t vmovq_n_s32(int32_t __a) {
+__funline int32x4_t vmovq_n_s32(int32_t __a) {
   return vdupq_n_s32(__a);
 }
-FUNK int64x2_t vmovq_n_s64(int64_t __a) {
+__funline int64x2_t vmovq_n_s64(int64_t __a) {
   return vdupq_n_s64(__a);
 }
-FUNK uint8x16_t vmovq_n_u8(uint8_t __a) {
+__funline uint8x16_t vmovq_n_u8(uint8_t __a) {
   return vdupq_n_u8(__a);
 }
-FUNK uint16x8_t vmovq_n_u16(uint16_t __a) {
+__funline uint16x8_t vmovq_n_u16(uint16_t __a) {
   return vdupq_n_u16(__a);
 }
-FUNK uint32x4_t vmovq_n_u32(uint32_t __a) {
+__funline uint32x4_t vmovq_n_u32(uint32_t __a) {
   return vdupq_n_u32(__a);
 }
-FUNK uint64x2_t vmovq_n_u64(uint64_t __a) {
+__funline uint64x2_t vmovq_n_u64(uint64_t __a) {
   return vdupq_n_u64(__a);
 }
-FUNK float32x2_t vmul_lane_f32(float32x2_t __a, float32x2_t __b, const int __lane) {
+__funline float32x2_t vmul_lane_f32(float32x2_t __a, float32x2_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK float64x1_t vmul_lane_f64(float64x1_t __a, float64x1_t __b, const int __lane) {
+__funline float64x1_t vmul_lane_f64(float64x1_t __a, float64x1_t __b, const int __lane) {
   return __a * __b;
 }
-FUNK int16x4_t vmul_lane_s16(int16x4_t __a, int16x4_t __b, const int __lane) {
+__funline int16x4_t vmul_lane_s16(int16x4_t __a, int16x4_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK int32x2_t vmul_lane_s32(int32x2_t __a, int32x2_t __b, const int __lane) {
+__funline int32x2_t vmul_lane_s32(int32x2_t __a, int32x2_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK uint16x4_t vmul_lane_u16(uint16x4_t __a, uint16x4_t __b, const int __lane) {
+__funline uint16x4_t vmul_lane_u16(uint16x4_t __a, uint16x4_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK uint32x2_t vmul_lane_u32(uint32x2_t __a, uint32x2_t __b, const int __lane) {
+__funline uint32x2_t vmul_lane_u32(uint32x2_t __a, uint32x2_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK float64_t vmuld_lane_f64(float64_t __a, float64x1_t __b, const int __lane) {
+__funline float64_t vmuld_lane_f64(float64_t __a, float64x1_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK float64_t vmuld_laneq_f64(float64_t __a, float64x2_t __b, const int __lane) {
+__funline float64_t vmuld_laneq_f64(float64_t __a, float64x2_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK float32_t vmuls_lane_f32(float32_t __a, float32x2_t __b, const int __lane) {
+__funline float32_t vmuls_lane_f32(float32_t __a, float32x2_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK float32_t vmuls_laneq_f32(float32_t __a, float32x4_t __b, const int __lane) {
+__funline float32_t vmuls_laneq_f32(float32_t __a, float32x4_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK float32x2_t vmul_laneq_f32(float32x2_t __a, float32x4_t __b, const int __lane) {
+__funline float32x2_t vmul_laneq_f32(float32x2_t __a, float32x4_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK float64x1_t vmul_laneq_f64(float64x1_t __a, float64x2_t __b, const int __lane) {
+__funline float64x1_t vmul_laneq_f64(float64x1_t __a, float64x2_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK int16x4_t vmul_laneq_s16(int16x4_t __a, int16x8_t __b, const int __lane) {
+__funline int16x4_t vmul_laneq_s16(int16x4_t __a, int16x8_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK int32x2_t vmul_laneq_s32(int32x2_t __a, int32x4_t __b, const int __lane) {
+__funline int32x2_t vmul_laneq_s32(int32x2_t __a, int32x4_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK uint16x4_t vmul_laneq_u16(uint16x4_t __a, uint16x8_t __b, const int __lane) {
+__funline uint16x4_t vmul_laneq_u16(uint16x4_t __a, uint16x8_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK uint32x2_t vmul_laneq_u32(uint32x2_t __a, uint32x4_t __b, const int __lane) {
+__funline uint32x2_t vmul_laneq_u32(uint32x2_t __a, uint32x4_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK float64x1_t vmul_n_f64(float64x1_t __a, float64_t __b) {
+__funline float64x1_t vmul_n_f64(float64x1_t __a, float64_t __b) {
   return (float64x1_t){vget_lane_f64(__a, 0) * __b};
 }
-FUNK float32x4_t vmulq_lane_f32(float32x4_t __a, float32x2_t __b, const int __lane) {
+__funline float32x4_t vmulq_lane_f32(float32x4_t __a, float32x2_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK float64x2_t vmulq_lane_f64(float64x2_t __a, float64x1_t __b, const int __lane) {
+__funline float64x2_t vmulq_lane_f64(float64x2_t __a, float64x1_t __b, const int __lane) {
   __AARCH64_LANE_CHECK(__a, __lane);
   return __a * __b[0];
 }
-FUNK int16x8_t vmulq_lane_s16(int16x8_t __a, int16x4_t __b, const int __lane) {
+__funline int16x8_t vmulq_lane_s16(int16x8_t __a, int16x4_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK int32x4_t vmulq_lane_s32(int32x4_t __a, int32x2_t __b, const int __lane) {
+__funline int32x4_t vmulq_lane_s32(int32x4_t __a, int32x2_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK uint16x8_t vmulq_lane_u16(uint16x8_t __a, uint16x4_t __b, const int __lane) {
+__funline uint16x8_t vmulq_lane_u16(uint16x8_t __a, uint16x4_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK uint32x4_t vmulq_lane_u32(uint32x4_t __a, uint32x2_t __b, const int __lane) {
+__funline uint32x4_t vmulq_lane_u32(uint32x4_t __a, uint32x2_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK float32x4_t vmulq_laneq_f32(float32x4_t __a, float32x4_t __b, const int __lane) {
+__funline float32x4_t vmulq_laneq_f32(float32x4_t __a, float32x4_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK float64x2_t vmulq_laneq_f64(float64x2_t __a, float64x2_t __b, const int __lane) {
+__funline float64x2_t vmulq_laneq_f64(float64x2_t __a, float64x2_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK int16x8_t vmulq_laneq_s16(int16x8_t __a, int16x8_t __b, const int __lane) {
+__funline int16x8_t vmulq_laneq_s16(int16x8_t __a, int16x8_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK int32x4_t vmulq_laneq_s32(int32x4_t __a, int32x4_t __b, const int __lane) {
+__funline int32x4_t vmulq_laneq_s32(int32x4_t __a, int32x4_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK uint16x8_t vmulq_laneq_u16(uint16x8_t __a, uint16x8_t __b, const int __lane) {
+__funline uint16x8_t vmulq_laneq_u16(uint16x8_t __a, uint16x8_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK uint32x4_t vmulq_laneq_u32(uint32x4_t __a, uint32x4_t __b, const int __lane) {
+__funline uint32x4_t vmulq_laneq_u32(uint32x4_t __a, uint32x4_t __b, const int __lane) {
   return __a * __aarch64_vget_lane_any(__b, __lane);
 }
-FUNK float32x2_t vmul_n_f32(float32x2_t __a, float32_t __b) {
+__funline float32x2_t vmul_n_f32(float32x2_t __a, float32_t __b) {
   return __a * __b;
 }
-FUNK float32x4_t vmulq_n_f32(float32x4_t __a, float32_t __b) {
+__funline float32x4_t vmulq_n_f32(float32x4_t __a, float32_t __b) {
   return __a * __b;
 }
-FUNK float64x2_t vmulq_n_f64(float64x2_t __a, float64_t __b) {
+__funline float64x2_t vmulq_n_f64(float64x2_t __a, float64_t __b) {
   return __a * __b;
 }
-FUNK int16x4_t vmul_n_s16(int16x4_t __a, int16_t __b) {
+__funline int16x4_t vmul_n_s16(int16x4_t __a, int16_t __b) {
   return __a * __b;
 }
-FUNK int16x8_t vmulq_n_s16(int16x8_t __a, int16_t __b) {
+__funline int16x8_t vmulq_n_s16(int16x8_t __a, int16_t __b) {
   return __a * __b;
 }
-FUNK int32x2_t vmul_n_s32(int32x2_t __a, int32_t __b) {
+__funline int32x2_t vmul_n_s32(int32x2_t __a, int32_t __b) {
   return __a * __b;
 }
-FUNK int32x4_t vmulq_n_s32(int32x4_t __a, int32_t __b) {
+__funline int32x4_t vmulq_n_s32(int32x4_t __a, int32_t __b) {
   return __a * __b;
 }
-FUNK uint16x4_t vmul_n_u16(uint16x4_t __a, uint16_t __b) {
+__funline uint16x4_t vmul_n_u16(uint16x4_t __a, uint16_t __b) {
   return __a * __b;
 }
-FUNK uint16x8_t vmulq_n_u16(uint16x8_t __a, uint16_t __b) {
+__funline uint16x8_t vmulq_n_u16(uint16x8_t __a, uint16_t __b) {
   return __a * __b;
 }
-FUNK uint32x2_t vmul_n_u32(uint32x2_t __a, uint32_t __b) {
+__funline uint32x2_t vmul_n_u32(uint32x2_t __a, uint32_t __b) {
   return __a * __b;
 }
-FUNK uint32x4_t vmulq_n_u32(uint32x4_t __a, uint32_t __b) {
+__funline uint32x4_t vmulq_n_u32(uint32x4_t __a, uint32_t __b) {
   return __a * __b;
 }
-FUNK poly8x8_t vmvn_p8(poly8x8_t __a) {
+__funline poly8x8_t vmvn_p8(poly8x8_t __a) {
   return (poly8x8_t) ~((int8x8_t)__a);
 }
-FUNK int8x8_t vmvn_s8(int8x8_t __a) {
+__funline int8x8_t vmvn_s8(int8x8_t __a) {
   return ~__a;
 }
-FUNK int16x4_t vmvn_s16(int16x4_t __a) {
+__funline int16x4_t vmvn_s16(int16x4_t __a) {
   return ~__a;
 }
-FUNK int32x2_t vmvn_s32(int32x2_t __a) {
+__funline int32x2_t vmvn_s32(int32x2_t __a) {
   return ~__a;
 }
-FUNK uint8x8_t vmvn_u8(uint8x8_t __a) {
+__funline uint8x8_t vmvn_u8(uint8x8_t __a) {
   return ~__a;
 }
-FUNK uint16x4_t vmvn_u16(uint16x4_t __a) {
+__funline uint16x4_t vmvn_u16(uint16x4_t __a) {
   return ~__a;
 }
-FUNK uint32x2_t vmvn_u32(uint32x2_t __a) {
+__funline uint32x2_t vmvn_u32(uint32x2_t __a) {
   return ~__a;
 }
-FUNK poly8x16_t vmvnq_p8(poly8x16_t __a) {
+__funline poly8x16_t vmvnq_p8(poly8x16_t __a) {
   return (poly8x16_t) ~((int8x16_t)__a);
 }
-FUNK int8x16_t vmvnq_s8(int8x16_t __a) {
+__funline int8x16_t vmvnq_s8(int8x16_t __a) {
   return ~__a;
 }
-FUNK int16x8_t vmvnq_s16(int16x8_t __a) {
+__funline int16x8_t vmvnq_s16(int16x8_t __a) {
   return ~__a;
 }
-FUNK int32x4_t vmvnq_s32(int32x4_t __a) {
+__funline int32x4_t vmvnq_s32(int32x4_t __a) {
   return ~__a;
 }
-FUNK uint8x16_t vmvnq_u8(uint8x16_t __a) {
+__funline uint8x16_t vmvnq_u8(uint8x16_t __a) {
   return ~__a;
 }
-FUNK uint16x8_t vmvnq_u16(uint16x8_t __a) {
+__funline uint16x8_t vmvnq_u16(uint16x8_t __a) {
   return ~__a;
 }
-FUNK uint32x4_t vmvnq_u32(uint32x4_t __a) {
+__funline uint32x4_t vmvnq_u32(uint32x4_t __a) {
   return ~__a;
 }
-FUNK float32x2_t vneg_f32(float32x2_t __a) {
+__funline float32x2_t vneg_f32(float32x2_t __a) {
   return -__a;
 }
-FUNK float64x1_t vneg_f64(float64x1_t __a) {
+__funline float64x1_t vneg_f64(float64x1_t __a) {
   return -__a;
 }
-FUNK int8x8_t vneg_s8(int8x8_t __a) {
+__funline int8x8_t vneg_s8(int8x8_t __a) {
   return -__a;
 }
-FUNK int16x4_t vneg_s16(int16x4_t __a) {
+__funline int16x4_t vneg_s16(int16x4_t __a) {
   return -__a;
 }
-FUNK int32x2_t vneg_s32(int32x2_t __a) {
+__funline int32x2_t vneg_s32(int32x2_t __a) {
   return -__a;
 }
-FUNK int64x1_t vneg_s64(int64x1_t __a) {
+__funline int64x1_t vneg_s64(int64x1_t __a) {
   return -__a;
 }
-FUNK int64_t vnegd_s64(int64_t __a) {
+__funline int64_t vnegd_s64(int64_t __a) {
   return -(uint64_t)__a;
 }
-FUNK float32x4_t vnegq_f32(float32x4_t __a) {
+__funline float32x4_t vnegq_f32(float32x4_t __a) {
   return -__a;
 }
-FUNK float64x2_t vnegq_f64(float64x2_t __a) {
+__funline float64x2_t vnegq_f64(float64x2_t __a) {
   return -__a;
 }
-FUNK int8x16_t vnegq_s8(int8x16_t __a) {
+__funline int8x16_t vnegq_s8(int8x16_t __a) {
   return -__a;
 }
-FUNK int16x8_t vnegq_s16(int16x8_t __a) {
+__funline int16x8_t vnegq_s16(int16x8_t __a) {
   return -__a;
 }
-FUNK int32x4_t vnegq_s32(int32x4_t __a) {
+__funline int32x4_t vnegq_s32(int32x4_t __a) {
   return -__a;
 }
-FUNK int64x2_t vnegq_s64(int64x2_t __a) {
+__funline int64x2_t vnegq_s64(int64x2_t __a) {
   return -__a;
 }
-FUNK float32x2_t vpadd_f32(float32x2_t __a, float32x2_t __b) {
+__funline float32x2_t vpadd_f32(float32x2_t __a, float32x2_t __b) {
   return __builtin_aarch64_faddpv2sf(__a, __b);
 }
-FUNK float32x4_t vpaddq_f32(float32x4_t __a, float32x4_t __b) {
+__funline float32x4_t vpaddq_f32(float32x4_t __a, float32x4_t __b) {
   return __builtin_aarch64_faddpv4sf(__a, __b);
 }
-FUNK float64x2_t vpaddq_f64(float64x2_t __a, float64x2_t __b) {
+__funline float64x2_t vpaddq_f64(float64x2_t __a, float64x2_t __b) {
   return __builtin_aarch64_faddpv2df(__a, __b);
 }
-FUNK int8x8_t vpadd_s8(int8x8_t __a, int8x8_t __b) {
+__funline int8x8_t vpadd_s8(int8x8_t __a, int8x8_t __b) {
   return __builtin_aarch64_addpv8qi(__a, __b);
 }
-FUNK int16x4_t vpadd_s16(int16x4_t __a, int16x4_t __b) {
+__funline int16x4_t vpadd_s16(int16x4_t __a, int16x4_t __b) {
   return __builtin_aarch64_addpv4hi(__a, __b);
 }
-FUNK int32x2_t vpadd_s32(int32x2_t __a, int32x2_t __b) {
+__funline int32x2_t vpadd_s32(int32x2_t __a, int32x2_t __b) {
   return __builtin_aarch64_addpv2si(__a, __b);
 }
-FUNK uint8x8_t vpadd_u8(uint8x8_t __a, uint8x8_t __b) {
+__funline uint8x8_t vpadd_u8(uint8x8_t __a, uint8x8_t __b) {
   return (uint8x8_t)__builtin_aarch64_addpv8qi((int8x8_t)__a, (int8x8_t)__b);
 }
-FUNK uint16x4_t vpadd_u16(uint16x4_t __a, uint16x4_t __b) {
+__funline uint16x4_t vpadd_u16(uint16x4_t __a, uint16x4_t __b) {
   return (uint16x4_t)__builtin_aarch64_addpv4hi((int16x4_t)__a, (int16x4_t)__b);
 }
-FUNK uint32x2_t vpadd_u32(uint32x2_t __a, uint32x2_t __b) {
+__funline uint32x2_t vpadd_u32(uint32x2_t __a, uint32x2_t __b) {
   return (uint32x2_t)__builtin_aarch64_addpv2si((int32x2_t)__a, (int32x2_t)__b);
 }
-FUNK float32_t vpadds_f32(float32x2_t __a) {
+__funline float32_t vpadds_f32(float32x2_t __a) {
   return __builtin_aarch64_reduc_plus_scal_v2sf(__a);
 }
-FUNK float64_t vpaddd_f64(float64x2_t __a) {
+__funline float64_t vpaddd_f64(float64x2_t __a) {
   return __builtin_aarch64_reduc_plus_scal_v2df(__a);
 }
-FUNK int64_t vpaddd_s64(int64x2_t __a) {
+__funline int64_t vpaddd_s64(int64x2_t __a) {
   return __builtin_aarch64_addpdi(__a);
 }
-FUNK uint64_t vpaddd_u64(uint64x2_t __a) {
+__funline uint64_t vpaddd_u64(uint64x2_t __a) {
   return __builtin_aarch64_addpdi((int64x2_t)__a);
 }
-FUNK int64x2_t vqabsq_s64(int64x2_t __a) {
+__funline int64x2_t vqabsq_s64(int64x2_t __a) {
   return (int64x2_t)__builtin_aarch64_sqabsv2di(__a);
 }
-FUNK int8_t vqabsb_s8(int8_t __a) {
+__funline int8_t vqabsb_s8(int8_t __a) {
   return (int8_t)__builtin_aarch64_sqabsqi(__a);
 }
-FUNK int16_t vqabsh_s16(int16_t __a) {
+__funline int16_t vqabsh_s16(int16_t __a) {
   return (int16_t)__builtin_aarch64_sqabshi(__a);
 }
-FUNK int32_t vqabss_s32(int32_t __a) {
+__funline int32_t vqabss_s32(int32_t __a) {
   return (int32_t)__builtin_aarch64_sqabssi(__a);
 }
-FUNK int64_t vqabsd_s64(int64_t __a) {
+__funline int64_t vqabsd_s64(int64_t __a) {
   return __builtin_aarch64_sqabsdi(__a);
 }
-FUNK int8_t vqaddb_s8(int8_t __a, int8_t __b) {
+__funline int8_t vqaddb_s8(int8_t __a, int8_t __b) {
   return (int8_t)__builtin_aarch64_sqaddqi(__a, __b);
 }
-FUNK int16_t vqaddh_s16(int16_t __a, int16_t __b) {
+__funline int16_t vqaddh_s16(int16_t __a, int16_t __b) {
   return (int16_t)__builtin_aarch64_sqaddhi(__a, __b);
 }
-FUNK int32_t vqadds_s32(int32_t __a, int32_t __b) {
+__funline int32_t vqadds_s32(int32_t __a, int32_t __b) {
   return (int32_t)__builtin_aarch64_sqaddsi(__a, __b);
 }
-FUNK int64_t vqaddd_s64(int64_t __a, int64_t __b) {
+__funline int64_t vqaddd_s64(int64_t __a, int64_t __b) {
   return __builtin_aarch64_sqadddi(__a, __b);
 }
-FUNK uint8_t vqaddb_u8(uint8_t __a, uint8_t __b) {
+__funline uint8_t vqaddb_u8(uint8_t __a, uint8_t __b) {
   return (uint8_t)__builtin_aarch64_uqaddqi_uuu(__a, __b);
 }
-FUNK uint16_t vqaddh_u16(uint16_t __a, uint16_t __b) {
+__funline uint16_t vqaddh_u16(uint16_t __a, uint16_t __b) {
   return (uint16_t)__builtin_aarch64_uqaddhi_uuu(__a, __b);
 }
-FUNK uint32_t vqadds_u32(uint32_t __a, uint32_t __b) {
+__funline uint32_t vqadds_u32(uint32_t __a, uint32_t __b) {
   return (uint32_t)__builtin_aarch64_uqaddsi_uuu(__a, __b);
 }
-FUNK uint64_t vqaddd_u64(uint64_t __a, uint64_t __b) {
+__funline uint64_t vqaddd_u64(uint64_t __a, uint64_t __b) {
   return __builtin_aarch64_uqadddi_uuu(__a, __b);
 }
-FUNK int32x4_t vqdmlal_s16(int32x4_t __a, int16x4_t __b, int16x4_t __c) {
+__funline int32x4_t vqdmlal_s16(int32x4_t __a, int16x4_t __b, int16x4_t __c) {
   return __builtin_aarch64_sqdmlalv4hi(__a, __b, __c);
 }
-FUNK int32x4_t vqdmlal_high_s16(int32x4_t __a, int16x8_t __b, int16x8_t __c) {
+__funline int32x4_t vqdmlal_high_s16(int32x4_t __a, int16x8_t __b, int16x8_t __c) {
   return __builtin_aarch64_sqdmlal2v8hi(__a, __b, __c);
 }
-FUNK int32x4_t vqdmlal_high_lane_s16(int32x4_t __a, int16x8_t __b, int16x4_t __c, int const __d) {
+__funline int32x4_t vqdmlal_high_lane_s16(int32x4_t __a, int16x8_t __b, int16x4_t __c, int const __d) {
   return __builtin_aarch64_sqdmlal2_lanev8hi(__a, __b, __c, __d);
 }
-FUNK int32x4_t vqdmlal_high_laneq_s16(int32x4_t __a, int16x8_t __b, int16x8_t __c, int const __d) {
+__funline int32x4_t vqdmlal_high_laneq_s16(int32x4_t __a, int16x8_t __b, int16x8_t __c, int const __d) {
   return __builtin_aarch64_sqdmlal2_laneqv8hi(__a, __b, __c, __d);
 }
-FUNK int32x4_t vqdmlal_high_n_s16(int32x4_t __a, int16x8_t __b, int16_t __c) {
+__funline int32x4_t vqdmlal_high_n_s16(int32x4_t __a, int16x8_t __b, int16_t __c) {
   return __builtin_aarch64_sqdmlal2_nv8hi(__a, __b, __c);
 }
-FUNK int32x4_t vqdmlal_lane_s16(int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d) {
+__funline int32x4_t vqdmlal_lane_s16(int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d) {
   return __builtin_aarch64_sqdmlal_lanev4hi(__a, __b, __c, __d);
 }
-FUNK int32x4_t vqdmlal_laneq_s16(int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d) {
+__funline int32x4_t vqdmlal_laneq_s16(int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d) {
   return __builtin_aarch64_sqdmlal_laneqv4hi(__a, __b, __c, __d);
 }
-FUNK int32x4_t vqdmlal_n_s16(int32x4_t __a, int16x4_t __b, int16_t __c) {
+__funline int32x4_t vqdmlal_n_s16(int32x4_t __a, int16x4_t __b, int16_t __c) {
   return __builtin_aarch64_sqdmlal_nv4hi(__a, __b, __c);
 }
-FUNK int64x2_t vqdmlal_s32(int64x2_t __a, int32x2_t __b, int32x2_t __c) {
+__funline int64x2_t vqdmlal_s32(int64x2_t __a, int32x2_t __b, int32x2_t __c) {
   return __builtin_aarch64_sqdmlalv2si(__a, __b, __c);
 }
-FUNK int64x2_t vqdmlal_high_s32(int64x2_t __a, int32x4_t __b, int32x4_t __c) {
+__funline int64x2_t vqdmlal_high_s32(int64x2_t __a, int32x4_t __b, int32x4_t __c) {
   return __builtin_aarch64_sqdmlal2v4si(__a, __b, __c);
 }
-FUNK int64x2_t vqdmlal_high_lane_s32(int64x2_t __a, int32x4_t __b, int32x2_t __c, int const __d) {
+__funline int64x2_t vqdmlal_high_lane_s32(int64x2_t __a, int32x4_t __b, int32x2_t __c, int const __d) {
   return __builtin_aarch64_sqdmlal2_lanev4si(__a, __b, __c, __d);
 }
-FUNK int64x2_t vqdmlal_high_laneq_s32(int64x2_t __a, int32x4_t __b, int32x4_t __c, int const __d) {
+__funline int64x2_t vqdmlal_high_laneq_s32(int64x2_t __a, int32x4_t __b, int32x4_t __c, int const __d) {
   return __builtin_aarch64_sqdmlal2_laneqv4si(__a, __b, __c, __d);
 }
-FUNK int64x2_t vqdmlal_high_n_s32(int64x2_t __a, int32x4_t __b, int32_t __c) {
+__funline int64x2_t vqdmlal_high_n_s32(int64x2_t __a, int32x4_t __b, int32_t __c) {
   return __builtin_aarch64_sqdmlal2_nv4si(__a, __b, __c);
 }
-FUNK int64x2_t vqdmlal_lane_s32(int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d) {
+__funline int64x2_t vqdmlal_lane_s32(int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d) {
   return __builtin_aarch64_sqdmlal_lanev2si(__a, __b, __c, __d);
 }
-FUNK int64x2_t vqdmlal_laneq_s32(int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d) {
+__funline int64x2_t vqdmlal_laneq_s32(int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d) {
   return __builtin_aarch64_sqdmlal_laneqv2si(__a, __b, __c, __d);
 }
-FUNK int64x2_t vqdmlal_n_s32(int64x2_t __a, int32x2_t __b, int32_t __c) {
+__funline int64x2_t vqdmlal_n_s32(int64x2_t __a, int32x2_t __b, int32_t __c) {
   return __builtin_aarch64_sqdmlal_nv2si(__a, __b, __c);
 }
-FUNK int32_t vqdmlalh_s16(int32_t __a, int16_t __b, int16_t __c) {
+__funline int32_t vqdmlalh_s16(int32_t __a, int16_t __b, int16_t __c) {
   return __builtin_aarch64_sqdmlalhi(__a, __b, __c);
 }
-FUNK int32_t vqdmlalh_lane_s16(int32_t __a, int16_t __b, int16x4_t __c, const int __d) {
+__funline int32_t vqdmlalh_lane_s16(int32_t __a, int16_t __b, int16x4_t __c, const int __d) {
   return __builtin_aarch64_sqdmlal_lanehi(__a, __b, __c, __d);
 }
-FUNK int32_t vqdmlalh_laneq_s16(int32_t __a, int16_t __b, int16x8_t __c, const int __d) {
+__funline int32_t vqdmlalh_laneq_s16(int32_t __a, int16_t __b, int16x8_t __c, const int __d) {
   return __builtin_aarch64_sqdmlal_laneqhi(__a, __b, __c, __d);
 }
-FUNK int64_t vqdmlals_s32(int64_t __a, int32_t __b, int32_t __c) {
+__funline int64_t vqdmlals_s32(int64_t __a, int32_t __b, int32_t __c) {
   return __builtin_aarch64_sqdmlalsi(__a, __b, __c);
 }
-FUNK int64_t vqdmlals_lane_s32(int64_t __a, int32_t __b, int32x2_t __c, const int __d) {
+__funline int64_t vqdmlals_lane_s32(int64_t __a, int32_t __b, int32x2_t __c, const int __d) {
   return __builtin_aarch64_sqdmlal_lanesi(__a, __b, __c, __d);
 }
-FUNK int64_t vqdmlals_laneq_s32(int64_t __a, int32_t __b, int32x4_t __c, const int __d) {
+__funline int64_t vqdmlals_laneq_s32(int64_t __a, int32_t __b, int32x4_t __c, const int __d) {
   return __builtin_aarch64_sqdmlal_laneqsi(__a, __b, __c, __d);
 }
-FUNK int32x4_t vqdmlsl_s16(int32x4_t __a, int16x4_t __b, int16x4_t __c) {
+__funline int32x4_t vqdmlsl_s16(int32x4_t __a, int16x4_t __b, int16x4_t __c) {
   return __builtin_aarch64_sqdmlslv4hi(__a, __b, __c);
 }
-FUNK int32x4_t vqdmlsl_high_s16(int32x4_t __a, int16x8_t __b, int16x8_t __c) {
+__funline int32x4_t vqdmlsl_high_s16(int32x4_t __a, int16x8_t __b, int16x8_t __c) {
   return __builtin_aarch64_sqdmlsl2v8hi(__a, __b, __c);
 }
-FUNK int32x4_t vqdmlsl_high_lane_s16(int32x4_t __a, int16x8_t __b, int16x4_t __c, int const __d) {
+__funline int32x4_t vqdmlsl_high_lane_s16(int32x4_t __a, int16x8_t __b, int16x4_t __c, int const __d) {
   return __builtin_aarch64_sqdmlsl2_lanev8hi(__a, __b, __c, __d);
 }
-FUNK int32x4_t vqdmlsl_high_laneq_s16(int32x4_t __a, int16x8_t __b, int16x8_t __c, int const __d) {
+__funline int32x4_t vqdmlsl_high_laneq_s16(int32x4_t __a, int16x8_t __b, int16x8_t __c, int const __d) {
   return __builtin_aarch64_sqdmlsl2_laneqv8hi(__a, __b, __c, __d);
 }
-FUNK int32x4_t vqdmlsl_high_n_s16(int32x4_t __a, int16x8_t __b, int16_t __c) {
+__funline int32x4_t vqdmlsl_high_n_s16(int32x4_t __a, int16x8_t __b, int16_t __c) {
   return __builtin_aarch64_sqdmlsl2_nv8hi(__a, __b, __c);
 }
-FUNK int32x4_t vqdmlsl_lane_s16(int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d) {
+__funline int32x4_t vqdmlsl_lane_s16(int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d) {
   return __builtin_aarch64_sqdmlsl_lanev4hi(__a, __b, __c, __d);
 }
-FUNK int32x4_t vqdmlsl_laneq_s16(int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d) {
+__funline int32x4_t vqdmlsl_laneq_s16(int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d) {
   return __builtin_aarch64_sqdmlsl_laneqv4hi(__a, __b, __c, __d);
 }
-FUNK int32x4_t vqdmlsl_n_s16(int32x4_t __a, int16x4_t __b, int16_t __c) {
+__funline int32x4_t vqdmlsl_n_s16(int32x4_t __a, int16x4_t __b, int16_t __c) {
   return __builtin_aarch64_sqdmlsl_nv4hi(__a, __b, __c);
 }
-FUNK int64x2_t vqdmlsl_s32(int64x2_t __a, int32x2_t __b, int32x2_t __c) {
+__funline int64x2_t vqdmlsl_s32(int64x2_t __a, int32x2_t __b, int32x2_t __c) {
   return __builtin_aarch64_sqdmlslv2si(__a, __b, __c);
 }
-FUNK int64x2_t vqdmlsl_high_s32(int64x2_t __a, int32x4_t __b, int32x4_t __c) {
+__funline int64x2_t vqdmlsl_high_s32(int64x2_t __a, int32x4_t __b, int32x4_t __c) {
   return __builtin_aarch64_sqdmlsl2v4si(__a, __b, __c);
 }
-FUNK int64x2_t vqdmlsl_high_lane_s32(int64x2_t __a, int32x4_t __b, int32x2_t __c, int const __d) {
+__funline int64x2_t vqdmlsl_high_lane_s32(int64x2_t __a, int32x4_t __b, int32x2_t __c, int const __d) {
   return __builtin_aarch64_sqdmlsl2_lanev4si(__a, __b, __c, __d);
 }
-FUNK int64x2_t vqdmlsl_high_laneq_s32(int64x2_t __a, int32x4_t __b, int32x4_t __c, int const __d) {
+__funline int64x2_t vqdmlsl_high_laneq_s32(int64x2_t __a, int32x4_t __b, int32x4_t __c, int const __d) {
   return __builtin_aarch64_sqdmlsl2_laneqv4si(__a, __b, __c, __d);
 }
-FUNK int64x2_t vqdmlsl_high_n_s32(int64x2_t __a, int32x4_t __b, int32_t __c) {
+__funline int64x2_t vqdmlsl_high_n_s32(int64x2_t __a, int32x4_t __b, int32_t __c) {
   return __builtin_aarch64_sqdmlsl2_nv4si(__a, __b, __c);
 }
-FUNK int64x2_t vqdmlsl_lane_s32(int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d) {
+__funline int64x2_t vqdmlsl_lane_s32(int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d) {
   return __builtin_aarch64_sqdmlsl_lanev2si(__a, __b, __c, __d);
 }
-FUNK int64x2_t vqdmlsl_laneq_s32(int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d) {
+__funline int64x2_t vqdmlsl_laneq_s32(int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d) {
   return __builtin_aarch64_sqdmlsl_laneqv2si(__a, __b, __c, __d);
 }
-FUNK int64x2_t vqdmlsl_n_s32(int64x2_t __a, int32x2_t __b, int32_t __c) {
+__funline int64x2_t vqdmlsl_n_s32(int64x2_t __a, int32x2_t __b, int32_t __c) {
   return __builtin_aarch64_sqdmlsl_nv2si(__a, __b, __c);
 }
-FUNK int32_t vqdmlslh_s16(int32_t __a, int16_t __b, int16_t __c) {
+__funline int32_t vqdmlslh_s16(int32_t __a, int16_t __b, int16_t __c) {
   return __builtin_aarch64_sqdmlslhi(__a, __b, __c);
 }
-FUNK int32_t vqdmlslh_lane_s16(int32_t __a, int16_t __b, int16x4_t __c, const int __d) {
+__funline int32_t vqdmlslh_lane_s16(int32_t __a, int16_t __b, int16x4_t __c, const int __d) {
   return __builtin_aarch64_sqdmlsl_lanehi(__a, __b, __c, __d);
 }
-FUNK int32_t vqdmlslh_laneq_s16(int32_t __a, int16_t __b, int16x8_t __c, const int __d) {
+__funline int32_t vqdmlslh_laneq_s16(int32_t __a, int16_t __b, int16x8_t __c, const int __d) {
   return __builtin_aarch64_sqdmlsl_laneqhi(__a, __b, __c, __d);
 }
-FUNK int64_t vqdmlsls_s32(int64_t __a, int32_t __b, int32_t __c) {
+__funline int64_t vqdmlsls_s32(int64_t __a, int32_t __b, int32_t __c) {
   return __builtin_aarch64_sqdmlslsi(__a, __b, __c);
 }
-FUNK int64_t vqdmlsls_lane_s32(int64_t __a, int32_t __b, int32x2_t __c, const int __d) {
+__funline int64_t vqdmlsls_lane_s32(int64_t __a, int32_t __b, int32x2_t __c, const int __d) {
   return __builtin_aarch64_sqdmlsl_lanesi(__a, __b, __c, __d);
 }
-FUNK int64_t vqdmlsls_laneq_s32(int64_t __a, int32_t __b, int32x4_t __c, const int __d) {
+__funline int64_t vqdmlsls_laneq_s32(int64_t __a, int32_t __b, int32x4_t __c, const int __d) {
   return __builtin_aarch64_sqdmlsl_laneqsi(__a, __b, __c, __d);
 }
-FUNK int16x4_t vqdmulh_lane_s16(int16x4_t __a, int16x4_t __b, const int __c) {
+__funline int16x4_t vqdmulh_lane_s16(int16x4_t __a, int16x4_t __b, const int __c) {
   return __builtin_aarch64_sqdmulh_lanev4hi(__a, __b, __c);
 }
-FUNK int32x2_t vqdmulh_lane_s32(int32x2_t __a, int32x2_t __b, const int __c) {
+__funline int32x2_t vqdmulh_lane_s32(int32x2_t __a, int32x2_t __b, const int __c) {
   return __builtin_aarch64_sqdmulh_lanev2si(__a, __b, __c);
 }
-FUNK int16x8_t vqdmulhq_lane_s16(int16x8_t __a, int16x4_t __b, const int __c) {
+__funline int16x8_t vqdmulhq_lane_s16(int16x8_t __a, int16x4_t __b, const int __c) {
   return __builtin_aarch64_sqdmulh_lanev8hi(__a, __b, __c);
 }
-FUNK int32x4_t vqdmulhq_lane_s32(int32x4_t __a, int32x2_t __b, const int __c) {
+__funline int32x4_t vqdmulhq_lane_s32(int32x4_t __a, int32x2_t __b, const int __c) {
   return __builtin_aarch64_sqdmulh_lanev4si(__a, __b, __c);
 }
-FUNK int16_t vqdmulhh_s16(int16_t __a, int16_t __b) {
+__funline int16_t vqdmulhh_s16(int16_t __a, int16_t __b) {
   return (int16_t)__builtin_aarch64_sqdmulhhi(__a, __b);
 }
-FUNK int16_t vqdmulhh_lane_s16(int16_t __a, int16x4_t __b, const int __c) {
+__funline int16_t vqdmulhh_lane_s16(int16_t __a, int16x4_t __b, const int __c) {
   return __builtin_aarch64_sqdmulh_lanehi(__a, __b, __c);
 }
-FUNK int16_t vqdmulhh_laneq_s16(int16_t __a, int16x8_t __b, const int __c) {
+__funline int16_t vqdmulhh_laneq_s16(int16_t __a, int16x8_t __b, const int __c) {
   return __builtin_aarch64_sqdmulh_laneqhi(__a, __b, __c);
 }
-FUNK int32_t vqdmulhs_s32(int32_t __a, int32_t __b) {
+__funline int32_t vqdmulhs_s32(int32_t __a, int32_t __b) {
   return (int32_t)__builtin_aarch64_sqdmulhsi(__a, __b);
 }
-FUNK int32_t vqdmulhs_lane_s32(int32_t __a, int32x2_t __b, const int __c) {
+__funline int32_t vqdmulhs_lane_s32(int32_t __a, int32x2_t __b, const int __c) {
   return __builtin_aarch64_sqdmulh_lanesi(__a, __b, __c);
 }
-FUNK int32_t vqdmulhs_laneq_s32(int32_t __a, int32x4_t __b, const int __c) {
+__funline int32_t vqdmulhs_laneq_s32(int32_t __a, int32x4_t __b, const int __c) {
   return __builtin_aarch64_sqdmulh_laneqsi(__a, __b, __c);
 }
-FUNK int32x4_t vqdmull_s16(int16x4_t __a, int16x4_t __b) {
+__funline int32x4_t vqdmull_s16(int16x4_t __a, int16x4_t __b) {
   return __builtin_aarch64_sqdmullv4hi(__a, __b);
 }
-FUNK int32x4_t vqdmull_high_s16(int16x8_t __a, int16x8_t __b) {
+__funline int32x4_t vqdmull_high_s16(int16x8_t __a, int16x8_t __b) {
   return __builtin_aarch64_sqdmull2v8hi(__a, __b);
 }
-FUNK int32x4_t vqdmull_high_lane_s16(int16x8_t __a, int16x4_t __b, int const __c) {
+__funline int32x4_t vqdmull_high_lane_s16(int16x8_t __a, int16x4_t __b, int const __c) {
   return __builtin_aarch64_sqdmull2_lanev8hi(__a, __b, __c);
 }
-FUNK int32x4_t vqdmull_high_laneq_s16(int16x8_t __a, int16x8_t __b, int const __c) {
+__funline int32x4_t vqdmull_high_laneq_s16(int16x8_t __a, int16x8_t __b, int const __c) {
   return __builtin_aarch64_sqdmull2_laneqv8hi(__a, __b, __c);
 }
-FUNK int32x4_t vqdmull_high_n_s16(int16x8_t __a, int16_t __b) {
+__funline int32x4_t vqdmull_high_n_s16(int16x8_t __a, int16_t __b) {
   return __builtin_aarch64_sqdmull2_nv8hi(__a, __b);
 }
-FUNK int32x4_t vqdmull_lane_s16(int16x4_t __a, int16x4_t __b, int const __c) {
+__funline int32x4_t vqdmull_lane_s16(int16x4_t __a, int16x4_t __b, int const __c) {
   return __builtin_aarch64_sqdmull_lanev4hi(__a, __b, __c);
 }
-FUNK int32x4_t vqdmull_laneq_s16(int16x4_t __a, int16x8_t __b, int const __c) {
+__funline int32x4_t vqdmull_laneq_s16(int16x4_t __a, int16x8_t __b, int const __c) {
   return __builtin_aarch64_sqdmull_laneqv4hi(__a, __b, __c);
 }
-FUNK int32x4_t vqdmull_n_s16(int16x4_t __a, int16_t __b) {
+__funline int32x4_t vqdmull_n_s16(int16x4_t __a, int16_t __b) {
   return __builtin_aarch64_sqdmull_nv4hi(__a, __b);
 }
-FUNK int64x2_t vqdmull_s32(int32x2_t __a, int32x2_t __b) {
+__funline int64x2_t vqdmull_s32(int32x2_t __a, int32x2_t __b) {
   return __builtin_aarch64_sqdmullv2si(__a, __b);
 }
-FUNK int64x2_t vqdmull_high_s32(int32x4_t __a, int32x4_t __b) {
+__funline int64x2_t vqdmull_high_s32(int32x4_t __a, int32x4_t __b) {
   return __builtin_aarch64_sqdmull2v4si(__a, __b);
 }
-FUNK int64x2_t vqdmull_high_lane_s32(int32x4_t __a, int32x2_t __b, int const __c) {
+__funline int64x2_t vqdmull_high_lane_s32(int32x4_t __a, int32x2_t __b, int const __c) {
   return __builtin_aarch64_sqdmull2_lanev4si(__a, __b, __c);
 }
-FUNK int64x2_t vqdmull_high_laneq_s32(int32x4_t __a, int32x4_t __b, int const __c) {
+__funline int64x2_t vqdmull_high_laneq_s32(int32x4_t __a, int32x4_t __b, int const __c) {
   return __builtin_aarch64_sqdmull2_laneqv4si(__a, __b, __c);
 }
-FUNK int64x2_t vqdmull_high_n_s32(int32x4_t __a, int32_t __b) {
+__funline int64x2_t vqdmull_high_n_s32(int32x4_t __a, int32_t __b) {
   return __builtin_aarch64_sqdmull2_nv4si(__a, __b);
 }
-FUNK int64x2_t vqdmull_lane_s32(int32x2_t __a, int32x2_t __b, int const __c) {
+__funline int64x2_t vqdmull_lane_s32(int32x2_t __a, int32x2_t __b, int const __c) {
   return __builtin_aarch64_sqdmull_lanev2si(__a, __b, __c);
 }
-FUNK int64x2_t vqdmull_laneq_s32(int32x2_t __a, int32x4_t __b, int const __c) {
+__funline int64x2_t vqdmull_laneq_s32(int32x2_t __a, int32x4_t __b, int const __c) {
   return __builtin_aarch64_sqdmull_laneqv2si(__a, __b, __c);
 }
-FUNK int64x2_t vqdmull_n_s32(int32x2_t __a, int32_t __b) {
+__funline int64x2_t vqdmull_n_s32(int32x2_t __a, int32_t __b) {
   return __builtin_aarch64_sqdmull_nv2si(__a, __b);
 }
-FUNK int32_t vqdmullh_s16(int16_t __a, int16_t __b) {
+__funline int32_t vqdmullh_s16(int16_t __a, int16_t __b) {
   return (int32_t)__builtin_aarch64_sqdmullhi(__a, __b);
 }
-FUNK int32_t vqdmullh_lane_s16(int16_t __a, int16x4_t __b, const int __c) {
+__funline int32_t vqdmullh_lane_s16(int16_t __a, int16x4_t __b, const int __c) {
   return __builtin_aarch64_sqdmull_lanehi(__a, __b, __c);
 }
-FUNK int32_t vqdmullh_laneq_s16(int16_t __a, int16x8_t __b, const int __c) {
+__funline int32_t vqdmullh_laneq_s16(int16_t __a, int16x8_t __b, const int __c) {
   return __builtin_aarch64_sqdmull_laneqhi(__a, __b, __c);
 }
-FUNK int64_t vqdmulls_s32(int32_t __a, int32_t __b) {
+__funline int64_t vqdmulls_s32(int32_t __a, int32_t __b) {
   return __builtin_aarch64_sqdmullsi(__a, __b);
 }
-FUNK int64_t vqdmulls_lane_s32(int32_t __a, int32x2_t __b, const int __c) {
+__funline int64_t vqdmulls_lane_s32(int32_t __a, int32x2_t __b, const int __c) {
   return __builtin_aarch64_sqdmull_lanesi(__a, __b, __c);
 }
-FUNK int64_t vqdmulls_laneq_s32(int32_t __a, int32x4_t __b, const int __c) {
+__funline int64_t vqdmulls_laneq_s32(int32_t __a, int32x4_t __b, const int __c) {
   return __builtin_aarch64_sqdmull_laneqsi(__a, __b, __c);
 }
-FUNK int8x8_t vqmovn_s16(int16x8_t __a) {
+__funline int8x8_t vqmovn_s16(int16x8_t __a) {
   return (int8x8_t)__builtin_aarch64_sqmovnv8hi(__a);
 }
-FUNK int16x4_t vqmovn_s32(int32x4_t __a) {
+__funline int16x4_t vqmovn_s32(int32x4_t __a) {
   return (int16x4_t)__builtin_aarch64_sqmovnv4si(__a);
 }
-FUNK int32x2_t vqmovn_s64(int64x2_t __a) {
+__funline int32x2_t vqmovn_s64(int64x2_t __a) {
   return (int32x2_t)__builtin_aarch64_sqmovnv2di(__a);
 }
-FUNK uint8x8_t vqmovn_u16(uint16x8_t __a) {
+__funline uint8x8_t vqmovn_u16(uint16x8_t __a) {
   return (uint8x8_t)__builtin_aarch64_uqmovnv8hi((int16x8_t)__a);
 }
-FUNK uint16x4_t vqmovn_u32(uint32x4_t __a) {
+__funline uint16x4_t vqmovn_u32(uint32x4_t __a) {
   return (uint16x4_t)__builtin_aarch64_uqmovnv4si((int32x4_t)__a);
 }
-FUNK uint32x2_t vqmovn_u64(uint64x2_t __a) {
+__funline uint32x2_t vqmovn_u64(uint64x2_t __a) {
   return (uint32x2_t)__builtin_aarch64_uqmovnv2di((int64x2_t)__a);
 }
-FUNK int8_t vqmovnh_s16(int16_t __a) {
+__funline int8_t vqmovnh_s16(int16_t __a) {
   return (int8_t)__builtin_aarch64_sqmovnhi(__a);
 }
-FUNK int16_t vqmovns_s32(int32_t __a) {
+__funline int16_t vqmovns_s32(int32_t __a) {
   return (int16_t)__builtin_aarch64_sqmovnsi(__a);
 }
-FUNK int32_t vqmovnd_s64(int64_t __a) {
+__funline int32_t vqmovnd_s64(int64_t __a) {
   return (int32_t)__builtin_aarch64_sqmovndi(__a);
 }
-FUNK uint8_t vqmovnh_u16(uint16_t __a) {
+__funline uint8_t vqmovnh_u16(uint16_t __a) {
   return (uint8_t)__builtin_aarch64_uqmovnhi(__a);
 }
-FUNK uint16_t vqmovns_u32(uint32_t __a) {
+__funline uint16_t vqmovns_u32(uint32_t __a) {
   return (uint16_t)__builtin_aarch64_uqmovnsi(__a);
 }
-FUNK uint32_t vqmovnd_u64(uint64_t __a) {
+__funline uint32_t vqmovnd_u64(uint64_t __a) {
   return (uint32_t)__builtin_aarch64_uqmovndi(__a);
 }
-FUNK uint8x8_t vqmovun_s16(int16x8_t __a) {
+__funline uint8x8_t vqmovun_s16(int16x8_t __a) {
   return (uint8x8_t)__builtin_aarch64_sqmovunv8hi(__a);
 }
-FUNK uint16x4_t vqmovun_s32(int32x4_t __a) {
+__funline uint16x4_t vqmovun_s32(int32x4_t __a) {
   return (uint16x4_t)__builtin_aarch64_sqmovunv4si(__a);
 }
-FUNK uint32x2_t vqmovun_s64(int64x2_t __a) {
+__funline uint32x2_t vqmovun_s64(int64x2_t __a) {
   return (uint32x2_t)__builtin_aarch64_sqmovunv2di(__a);
 }
-FUNK int8_t vqmovunh_s16(int16_t __a) {
+__funline int8_t vqmovunh_s16(int16_t __a) {
   return (int8_t)__builtin_aarch64_sqmovunhi(__a);
 }
-FUNK int16_t vqmovuns_s32(int32_t __a) {
+__funline int16_t vqmovuns_s32(int32_t __a) {
   return (int16_t)__builtin_aarch64_sqmovunsi(__a);
 }
-FUNK int32_t vqmovund_s64(int64_t __a) {
+__funline int32_t vqmovund_s64(int64_t __a) {
   return (int32_t)__builtin_aarch64_sqmovundi(__a);
 }
-FUNK int64x2_t vqnegq_s64(int64x2_t __a) {
+__funline int64x2_t vqnegq_s64(int64x2_t __a) {
   return (int64x2_t)__builtin_aarch64_sqnegv2di(__a);
 }
-FUNK int8_t vqnegb_s8(int8_t __a) {
+__funline int8_t vqnegb_s8(int8_t __a) {
   return (int8_t)__builtin_aarch64_sqnegqi(__a);
 }
-FUNK int16_t vqnegh_s16(int16_t __a) {
+__funline int16_t vqnegh_s16(int16_t __a) {
   return (int16_t)__builtin_aarch64_sqneghi(__a);
 }
-FUNK int32_t vqnegs_s32(int32_t __a) {
+__funline int32_t vqnegs_s32(int32_t __a) {
   return (int32_t)__builtin_aarch64_sqnegsi(__a);
 }
-FUNK int64_t vqnegd_s64(int64_t __a) {
+__funline int64_t vqnegd_s64(int64_t __a) {
   return __builtin_aarch64_sqnegdi(__a);
 }
-FUNK int16x4_t vqrdmulh_lane_s16(int16x4_t __a, int16x4_t __b, const int __c) {
+__funline int16x4_t vqrdmulh_lane_s16(int16x4_t __a, int16x4_t __b, const int __c) {
   return __builtin_aarch64_sqrdmulh_lanev4hi(__a, __b, __c);
 }
-FUNK int32x2_t vqrdmulh_lane_s32(int32x2_t __a, int32x2_t __b, const int __c) {
+__funline int32x2_t vqrdmulh_lane_s32(int32x2_t __a, int32x2_t __b, const int __c) {
   return __builtin_aarch64_sqrdmulh_lanev2si(__a, __b, __c);
 }
-FUNK int16x8_t vqrdmulhq_lane_s16(int16x8_t __a, int16x4_t __b, const int __c) {
+__funline int16x8_t vqrdmulhq_lane_s16(int16x8_t __a, int16x4_t __b, const int __c) {
   return __builtin_aarch64_sqrdmulh_lanev8hi(__a, __b, __c);
 }
-FUNK int32x4_t vqrdmulhq_lane_s32(int32x4_t __a, int32x2_t __b, const int __c) {
+__funline int32x4_t vqrdmulhq_lane_s32(int32x4_t __a, int32x2_t __b, const int __c) {
   return __builtin_aarch64_sqrdmulh_lanev4si(__a, __b, __c);
 }
-FUNK int16_t vqrdmulhh_s16(int16_t __a, int16_t __b) {
+__funline int16_t vqrdmulhh_s16(int16_t __a, int16_t __b) {
   return (int16_t)__builtin_aarch64_sqrdmulhhi(__a, __b);
 }
-FUNK int16_t vqrdmulhh_lane_s16(int16_t __a, int16x4_t __b, const int __c) {
+__funline int16_t vqrdmulhh_lane_s16(int16_t __a, int16x4_t __b, const int __c) {
   return __builtin_aarch64_sqrdmulh_lanehi(__a, __b, __c);
 }
-FUNK int16_t vqrdmulhh_laneq_s16(int16_t __a, int16x8_t __b, const int __c) {
+__funline int16_t vqrdmulhh_laneq_s16(int16_t __a, int16x8_t __b, const int __c) {
   return __builtin_aarch64_sqrdmulh_laneqhi(__a, __b, __c);
 }
-FUNK int32_t vqrdmulhs_s32(int32_t __a, int32_t __b) {
+__funline int32_t vqrdmulhs_s32(int32_t __a, int32_t __b) {
   return (int32_t)__builtin_aarch64_sqrdmulhsi(__a, __b);
 }
-FUNK int32_t vqrdmulhs_lane_s32(int32_t __a, int32x2_t __b, const int __c) {
+__funline int32_t vqrdmulhs_lane_s32(int32_t __a, int32x2_t __b, const int __c) {
   return __builtin_aarch64_sqrdmulh_lanesi(__a, __b, __c);
 }
-FUNK int32_t vqrdmulhs_laneq_s32(int32_t __a, int32x4_t __b, const int __c) {
+__funline int32_t vqrdmulhs_laneq_s32(int32_t __a, int32x4_t __b, const int __c) {
   return __builtin_aarch64_sqrdmulh_laneqsi(__a, __b, __c);
 }
-FUNK int8x8_t vqrshl_s8(int8x8_t __a, int8x8_t __b) {
+__funline int8x8_t vqrshl_s8(int8x8_t __a, int8x8_t __b) {
   return __builtin_aarch64_sqrshlv8qi(__a, __b);
 }
-FUNK int16x4_t vqrshl_s16(int16x4_t __a, int16x4_t __b) {
+__funline int16x4_t vqrshl_s16(int16x4_t __a, int16x4_t __b) {
   return __builtin_aarch64_sqrshlv4hi(__a, __b);
 }
-FUNK int32x2_t vqrshl_s32(int32x2_t __a, int32x2_t __b) {
+__funline int32x2_t vqrshl_s32(int32x2_t __a, int32x2_t __b) {
   return __builtin_aarch64_sqrshlv2si(__a, __b);
 }
-FUNK int64x1_t vqrshl_s64(int64x1_t __a, int64x1_t __b) {
+__funline int64x1_t vqrshl_s64(int64x1_t __a, int64x1_t __b) {
   return (int64x1_t){__builtin_aarch64_sqrshldi(__a[0], __b[0])};
 }
-FUNK uint8x8_t vqrshl_u8(uint8x8_t __a, int8x8_t __b) {
+__funline uint8x8_t vqrshl_u8(uint8x8_t __a, int8x8_t __b) {
   return __builtin_aarch64_uqrshlv8qi_uus(__a, __b);
 }
-FUNK uint16x4_t vqrshl_u16(uint16x4_t __a, int16x4_t __b) {
+__funline uint16x4_t vqrshl_u16(uint16x4_t __a, int16x4_t __b) {
   return __builtin_aarch64_uqrshlv4hi_uus(__a, __b);
 }
-FUNK uint32x2_t vqrshl_u32(uint32x2_t __a, int32x2_t __b) {
+__funline uint32x2_t vqrshl_u32(uint32x2_t __a, int32x2_t __b) {
   return __builtin_aarch64_uqrshlv2si_uus(__a, __b);
 }
-FUNK uint64x1_t vqrshl_u64(uint64x1_t __a, int64x1_t __b) {
+__funline uint64x1_t vqrshl_u64(uint64x1_t __a, int64x1_t __b) {
   return (uint64x1_t){__builtin_aarch64_uqrshldi_uus(__a[0], __b[0])};
 }
-FUNK int8x16_t vqrshlq_s8(int8x16_t __a, int8x16_t __b) {
+__funline int8x16_t vqrshlq_s8(int8x16_t __a, int8x16_t __b) {
   return __builtin_aarch64_sqrshlv16qi(__a, __b);
 }
-FUNK int16x8_t vqrshlq_s16(int16x8_t __a, int16x8_t __b) {
+__funline int16x8_t vqrshlq_s16(int16x8_t __a, int16x8_t __b) {
   return __builtin_aarch64_sqrshlv8hi(__a, __b);
 }
-FUNK int32x4_t vqrshlq_s32(int32x4_t __a, int32x4_t __b) {
+__funline int32x4_t vqrshlq_s32(int32x4_t __a, int32x4_t __b) {
   return __builtin_aarch64_sqrshlv4si(__a, __b);
 }
-FUNK int64x2_t vqrshlq_s64(int64x2_t __a, int64x2_t __b) {
+__funline int64x2_t vqrshlq_s64(int64x2_t __a, int64x2_t __b) {
   return __builtin_aarch64_sqrshlv2di(__a, __b);
 }
-FUNK uint8x16_t vqrshlq_u8(uint8x16_t __a, int8x16_t __b) {
+__funline uint8x16_t vqrshlq_u8(uint8x16_t __a, int8x16_t __b) {
   return __builtin_aarch64_uqrshlv16qi_uus(__a, __b);
 }
-FUNK uint16x8_t vqrshlq_u16(uint16x8_t __a, int16x8_t __b) {
+__funline uint16x8_t vqrshlq_u16(uint16x8_t __a, int16x8_t __b) {
   return __builtin_aarch64_uqrshlv8hi_uus(__a, __b);
 }
-FUNK uint32x4_t vqrshlq_u32(uint32x4_t __a, int32x4_t __b) {
+__funline uint32x4_t vqrshlq_u32(uint32x4_t __a, int32x4_t __b) {
   return __builtin_aarch64_uqrshlv4si_uus(__a, __b);
 }
-FUNK uint64x2_t vqrshlq_u64(uint64x2_t __a, int64x2_t __b) {
+__funline uint64x2_t vqrshlq_u64(uint64x2_t __a, int64x2_t __b) {
   return __builtin_aarch64_uqrshlv2di_uus(__a, __b);
 }
-FUNK int8_t vqrshlb_s8(int8_t __a, int8_t __b) {
+__funline int8_t vqrshlb_s8(int8_t __a, int8_t __b) {
   return __builtin_aarch64_sqrshlqi(__a, __b);
 }
-FUNK int16_t vqrshlh_s16(int16_t __a, int16_t __b) {
+__funline int16_t vqrshlh_s16(int16_t __a, int16_t __b) {
   return __builtin_aarch64_sqrshlhi(__a, __b);
 }
-FUNK int32_t vqrshls_s32(int32_t __a, int32_t __b) {
+__funline int32_t vqrshls_s32(int32_t __a, int32_t __b) {
   return __builtin_aarch64_sqrshlsi(__a, __b);
 }
-FUNK int64_t vqrshld_s64(int64_t __a, int64_t __b) {
+__funline int64_t vqrshld_s64(int64_t __a, int64_t __b) {
   return __builtin_aarch64_sqrshldi(__a, __b);
 }
-FUNK uint8_t vqrshlb_u8(uint8_t __a, uint8_t __b) {
+__funline uint8_t vqrshlb_u8(uint8_t __a, uint8_t __b) {
   return __builtin_aarch64_uqrshlqi_uus(__a, __b);
 }
-FUNK uint16_t vqrshlh_u16(uint16_t __a, uint16_t __b) {
+__funline uint16_t vqrshlh_u16(uint16_t __a, uint16_t __b) {
   return __builtin_aarch64_uqrshlhi_uus(__a, __b);
 }
-FUNK uint32_t vqrshls_u32(uint32_t __a, uint32_t __b) {
+__funline uint32_t vqrshls_u32(uint32_t __a, uint32_t __b) {
   return __builtin_aarch64_uqrshlsi_uus(__a, __b);
 }
-FUNK uint64_t vqrshld_u64(uint64_t __a, uint64_t __b) {
+__funline uint64_t vqrshld_u64(uint64_t __a, uint64_t __b) {
   return __builtin_aarch64_uqrshldi_uus(__a, __b);
 }
-FUNK int8x8_t vqrshrn_n_s16(int16x8_t __a, const int __b) {
+__funline int8x8_t vqrshrn_n_s16(int16x8_t __a, const int __b) {
   return (int8x8_t)__builtin_aarch64_sqrshrn_nv8hi(__a, __b);
 }
-FUNK int16x4_t vqrshrn_n_s32(int32x4_t __a, const int __b) {
+__funline int16x4_t vqrshrn_n_s32(int32x4_t __a, const int __b) {
   return (int16x4_t)__builtin_aarch64_sqrshrn_nv4si(__a, __b);
 }
-FUNK int32x2_t vqrshrn_n_s64(int64x2_t __a, const int __b) {
+__funline int32x2_t vqrshrn_n_s64(int64x2_t __a, const int __b) {
   return (int32x2_t)__builtin_aarch64_sqrshrn_nv2di(__a, __b);
 }
-FUNK uint8x8_t vqrshrn_n_u16(uint16x8_t __a, const int __b) {
+__funline uint8x8_t vqrshrn_n_u16(uint16x8_t __a, const int __b) {
   return __builtin_aarch64_uqrshrn_nv8hi_uus(__a, __b);
 }
-FUNK uint16x4_t vqrshrn_n_u32(uint32x4_t __a, const int __b) {
+__funline uint16x4_t vqrshrn_n_u32(uint32x4_t __a, const int __b) {
   return __builtin_aarch64_uqrshrn_nv4si_uus(__a, __b);
 }
-FUNK uint32x2_t vqrshrn_n_u64(uint64x2_t __a, const int __b) {
+__funline uint32x2_t vqrshrn_n_u64(uint64x2_t __a, const int __b) {
   return __builtin_aarch64_uqrshrn_nv2di_uus(__a, __b);
 }
-FUNK int8_t vqrshrnh_n_s16(int16_t __a, const int __b) {
+__funline int8_t vqrshrnh_n_s16(int16_t __a, const int __b) {
   return (int8_t)__builtin_aarch64_sqrshrn_nhi(__a, __b);
 }
-FUNK int16_t vqrshrns_n_s32(int32_t __a, const int __b) {
+__funline int16_t vqrshrns_n_s32(int32_t __a, const int __b) {
   return (int16_t)__builtin_aarch64_sqrshrn_nsi(__a, __b);
 }
-FUNK int32_t vqrshrnd_n_s64(int64_t __a, const int __b) {
+__funline int32_t vqrshrnd_n_s64(int64_t __a, const int __b) {
   return (int32_t)__builtin_aarch64_sqrshrn_ndi(__a, __b);
 }
-FUNK uint8_t vqrshrnh_n_u16(uint16_t __a, const int __b) {
+__funline uint8_t vqrshrnh_n_u16(uint16_t __a, const int __b) {
   return __builtin_aarch64_uqrshrn_nhi_uus(__a, __b);
 }
-FUNK uint16_t vqrshrns_n_u32(uint32_t __a, const int __b) {
+__funline uint16_t vqrshrns_n_u32(uint32_t __a, const int __b) {
   return __builtin_aarch64_uqrshrn_nsi_uus(__a, __b);
 }
-FUNK uint32_t vqrshrnd_n_u64(uint64_t __a, const int __b) {
+__funline uint32_t vqrshrnd_n_u64(uint64_t __a, const int __b) {
   return __builtin_aarch64_uqrshrn_ndi_uus(__a, __b);
 }
-FUNK uint8x8_t vqrshrun_n_s16(int16x8_t __a, const int __b) {
+__funline uint8x8_t vqrshrun_n_s16(int16x8_t __a, const int __b) {
   return (uint8x8_t)__builtin_aarch64_sqrshrun_nv8hi(__a, __b);
 }
-FUNK uint16x4_t vqrshrun_n_s32(int32x4_t __a, const int __b) {
+__funline uint16x4_t vqrshrun_n_s32(int32x4_t __a, const int __b) {
   return (uint16x4_t)__builtin_aarch64_sqrshrun_nv4si(__a, __b);
 }
-FUNK uint32x2_t vqrshrun_n_s64(int64x2_t __a, const int __b) {
+__funline uint32x2_t vqrshrun_n_s64(int64x2_t __a, const int __b) {
   return (uint32x2_t)__builtin_aarch64_sqrshrun_nv2di(__a, __b);
 }
-FUNK int8_t vqrshrunh_n_s16(int16_t __a, const int __b) {
+__funline int8_t vqrshrunh_n_s16(int16_t __a, const int __b) {
   return (int8_t)__builtin_aarch64_sqrshrun_nhi(__a, __b);
 }
-FUNK int16_t vqrshruns_n_s32(int32_t __a, const int __b) {
+__funline int16_t vqrshruns_n_s32(int32_t __a, const int __b) {
   return (int16_t)__builtin_aarch64_sqrshrun_nsi(__a, __b);
 }
-FUNK int32_t vqrshrund_n_s64(int64_t __a, const int __b) {
+__funline int32_t vqrshrund_n_s64(int64_t __a, const int __b) {
   return (int32_t)__builtin_aarch64_sqrshrun_ndi(__a, __b);
 }
-FUNK int8x8_t vqshl_s8(int8x8_t __a, int8x8_t __b) {
+__funline int8x8_t vqshl_s8(int8x8_t __a, int8x8_t __b) {
   return __builtin_aarch64_sqshlv8qi(__a, __b);
 }
-FUNK int16x4_t vqshl_s16(int16x4_t __a, int16x4_t __b) {
+__funline int16x4_t vqshl_s16(int16x4_t __a, int16x4_t __b) {
   return __builtin_aarch64_sqshlv4hi(__a, __b);
 }
-FUNK int32x2_t vqshl_s32(int32x2_t __a, int32x2_t __b) {
+__funline int32x2_t vqshl_s32(int32x2_t __a, int32x2_t __b) {
   return __builtin_aarch64_sqshlv2si(__a, __b);
 }
-FUNK int64x1_t vqshl_s64(int64x1_t __a, int64x1_t __b) {
+__funline int64x1_t vqshl_s64(int64x1_t __a, int64x1_t __b) {
   return (int64x1_t){__builtin_aarch64_sqshldi(__a[0], __b[0])};
 }
-FUNK uint8x8_t vqshl_u8(uint8x8_t __a, int8x8_t __b) {
+__funline uint8x8_t vqshl_u8(uint8x8_t __a, int8x8_t __b) {
   return __builtin_aarch64_uqshlv8qi_uus(__a, __b);
 }
-FUNK uint16x4_t vqshl_u16(uint16x4_t __a, int16x4_t __b) {
+__funline uint16x4_t vqshl_u16(uint16x4_t __a, int16x4_t __b) {
   return __builtin_aarch64_uqshlv4hi_uus(__a, __b);
 }
-FUNK uint32x2_t vqshl_u32(uint32x2_t __a, int32x2_t __b) {
+__funline uint32x2_t vqshl_u32(uint32x2_t __a, int32x2_t __b) {
   return __builtin_aarch64_uqshlv2si_uus(__a, __b);
 }
-FUNK uint64x1_t vqshl_u64(uint64x1_t __a, int64x1_t __b) {
+__funline uint64x1_t vqshl_u64(uint64x1_t __a, int64x1_t __b) {
   return (uint64x1_t){__builtin_aarch64_uqshldi_uus(__a[0], __b[0])};
 }
-FUNK int8x16_t vqshlq_s8(int8x16_t __a, int8x16_t __b) {
+__funline int8x16_t vqshlq_s8(int8x16_t __a, int8x16_t __b) {
   return __builtin_aarch64_sqshlv16qi(__a, __b);
 }
-FUNK int16x8_t vqshlq_s16(int16x8_t __a, int16x8_t __b) {
+__funline int16x8_t vqshlq_s16(int16x8_t __a, int16x8_t __b) {
   return __builtin_aarch64_sqshlv8hi(__a, __b);
 }
-FUNK int32x4_t vqshlq_s32(int32x4_t __a, int32x4_t __b) {
+__funline int32x4_t vqshlq_s32(int32x4_t __a, int32x4_t __b) {
   return __builtin_aarch64_sqshlv4si(__a, __b);
 }
-FUNK int64x2_t vqshlq_s64(int64x2_t __a, int64x2_t __b) {
+__funline int64x2_t vqshlq_s64(int64x2_t __a, int64x2_t __b) {
   return __builtin_aarch64_sqshlv2di(__a, __b);
 }
-FUNK uint8x16_t vqshlq_u8(uint8x16_t __a, int8x16_t __b) {
+__funline uint8x16_t vqshlq_u8(uint8x16_t __a, int8x16_t __b) {
   return __builtin_aarch64_uqshlv16qi_uus(__a, __b);
 }
-FUNK uint16x8_t vqshlq_u16(uint16x8_t __a, int16x8_t __b) {
+__funline uint16x8_t vqshlq_u16(uint16x8_t __a, int16x8_t __b) {
   return __builtin_aarch64_uqshlv8hi_uus(__a, __b);
 }
-FUNK uint32x4_t vqshlq_u32(uint32x4_t __a, int32x4_t __b) {
+__funline uint32x4_t vqshlq_u32(uint32x4_t __a, int32x4_t __b) {
   return __builtin_aarch64_uqshlv4si_uus(__a, __b);
 }
-FUNK uint64x2_t vqshlq_u64(uint64x2_t __a, int64x2_t __b) {
+__funline uint64x2_t vqshlq_u64(uint64x2_t __a, int64x2_t __b) {
   return __builtin_aarch64_uqshlv2di_uus(__a, __b);
 }
-FUNK int8_t vqshlb_s8(int8_t __a, int8_t __b) {
+__funline int8_t vqshlb_s8(int8_t __a, int8_t __b) {
   return __builtin_aarch64_sqshlqi(__a, __b);
 }
-FUNK int16_t vqshlh_s16(int16_t __a, int16_t __b) {
+__funline int16_t vqshlh_s16(int16_t __a, int16_t __b) {
   return __builtin_aarch64_sqshlhi(__a, __b);
 }
-FUNK int32_t vqshls_s32(int32_t __a, int32_t __b) {
+__funline int32_t vqshls_s32(int32_t __a, int32_t __b) {
   return __builtin_aarch64_sqshlsi(__a, __b);
 }
-FUNK int64_t vqshld_s64(int64_t __a, int64_t __b) {
+__funline int64_t vqshld_s64(int64_t __a, int64_t __b) {
   return __builtin_aarch64_sqshldi(__a, __b);
 }
-FUNK uint8_t vqshlb_u8(uint8_t __a, uint8_t __b) {
+__funline uint8_t vqshlb_u8(uint8_t __a, uint8_t __b) {
   return __builtin_aarch64_uqshlqi_uus(__a, __b);
 }
-FUNK uint16_t vqshlh_u16(uint16_t __a, uint16_t __b) {
+__funline uint16_t vqshlh_u16(uint16_t __a, uint16_t __b) {
   return __builtin_aarch64_uqshlhi_uus(__a, __b);
 }
-FUNK uint32_t vqshls_u32(uint32_t __a, uint32_t __b) {
+__funline uint32_t vqshls_u32(uint32_t __a, uint32_t __b) {
   return __builtin_aarch64_uqshlsi_uus(__a, __b);
 }
-FUNK uint64_t vqshld_u64(uint64_t __a, uint64_t __b) {
+__funline uint64_t vqshld_u64(uint64_t __a, uint64_t __b) {
   return __builtin_aarch64_uqshldi_uus(__a, __b);
 }
-FUNK int8x8_t vqshl_n_s8(int8x8_t __a, const int __b) {
+__funline int8x8_t vqshl_n_s8(int8x8_t __a, const int __b) {
   return (int8x8_t)__builtin_aarch64_sqshl_nv8qi(__a, __b);
 }
-FUNK int16x4_t vqshl_n_s16(int16x4_t __a, const int __b) {
+__funline int16x4_t vqshl_n_s16(int16x4_t __a, const int __b) {
   return (int16x4_t)__builtin_aarch64_sqshl_nv4hi(__a, __b);
 }
-FUNK int32x2_t vqshl_n_s32(int32x2_t __a, const int __b) {
+__funline int32x2_t vqshl_n_s32(int32x2_t __a, const int __b) {
   return (int32x2_t)__builtin_aarch64_sqshl_nv2si(__a, __b);
 }
-FUNK int64x1_t vqshl_n_s64(int64x1_t __a, const int __b) {
+__funline int64x1_t vqshl_n_s64(int64x1_t __a, const int __b) {
   return (int64x1_t){__builtin_aarch64_sqshl_ndi(__a[0], __b)};
 }
-FUNK uint8x8_t vqshl_n_u8(uint8x8_t __a, const int __b) {
+__funline uint8x8_t vqshl_n_u8(uint8x8_t __a, const int __b) {
   return __builtin_aarch64_uqshl_nv8qi_uus(__a, __b);
 }
-FUNK uint16x4_t vqshl_n_u16(uint16x4_t __a, const int __b) {
+__funline uint16x4_t vqshl_n_u16(uint16x4_t __a, const int __b) {
   return __builtin_aarch64_uqshl_nv4hi_uus(__a, __b);
 }
-FUNK uint32x2_t vqshl_n_u32(uint32x2_t __a, const int __b) {
+__funline uint32x2_t vqshl_n_u32(uint32x2_t __a, const int __b) {
   return __builtin_aarch64_uqshl_nv2si_uus(__a, __b);
 }
-FUNK uint64x1_t vqshl_n_u64(uint64x1_t __a, const int __b) {
+__funline uint64x1_t vqshl_n_u64(uint64x1_t __a, const int __b) {
   return (uint64x1_t){__builtin_aarch64_uqshl_ndi_uus(__a[0], __b)};
 }
-FUNK int8x16_t vqshlq_n_s8(int8x16_t __a, const int __b) {
+__funline int8x16_t vqshlq_n_s8(int8x16_t __a, const int __b) {
   return (int8x16_t)__builtin_aarch64_sqshl_nv16qi(__a, __b);
 }
-FUNK int16x8_t vqshlq_n_s16(int16x8_t __a, const int __b) {
+__funline int16x8_t vqshlq_n_s16(int16x8_t __a, const int __b) {
   return (int16x8_t)__builtin_aarch64_sqshl_nv8hi(__a, __b);
 }
-FUNK int32x4_t vqshlq_n_s32(int32x4_t __a, const int __b) {
+__funline int32x4_t vqshlq_n_s32(int32x4_t __a, const int __b) {
   return (int32x4_t)__builtin_aarch64_sqshl_nv4si(__a, __b);
 }
-FUNK int64x2_t vqshlq_n_s64(int64x2_t __a, const int __b) {
+__funline int64x2_t vqshlq_n_s64(int64x2_t __a, const int __b) {
   return (int64x2_t)__builtin_aarch64_sqshl_nv2di(__a, __b);
 }
-FUNK uint8x16_t vqshlq_n_u8(uint8x16_t __a, const int __b) {
+__funline uint8x16_t vqshlq_n_u8(uint8x16_t __a, const int __b) {
   return __builtin_aarch64_uqshl_nv16qi_uus(__a, __b);
 }
-FUNK uint16x8_t vqshlq_n_u16(uint16x8_t __a, const int __b) {
+__funline uint16x8_t vqshlq_n_u16(uint16x8_t __a, const int __b) {
   return __builtin_aarch64_uqshl_nv8hi_uus(__a, __b);
 }
-FUNK uint32x4_t vqshlq_n_u32(uint32x4_t __a, const int __b) {
+__funline uint32x4_t vqshlq_n_u32(uint32x4_t __a, const int __b) {
   return __builtin_aarch64_uqshl_nv4si_uus(__a, __b);
 }
-FUNK uint64x2_t vqshlq_n_u64(uint64x2_t __a, const int __b) {
+__funline uint64x2_t vqshlq_n_u64(uint64x2_t __a, const int __b) {
   return __builtin_aarch64_uqshl_nv2di_uus(__a, __b);
 }
-FUNK int8_t vqshlb_n_s8(int8_t __a, const int __b) {
+__funline int8_t vqshlb_n_s8(int8_t __a, const int __b) {
   return (int8_t)__builtin_aarch64_sqshl_nqi(__a, __b);
 }
-FUNK int16_t vqshlh_n_s16(int16_t __a, const int __b) {
+__funline int16_t vqshlh_n_s16(int16_t __a, const int __b) {
   return (int16_t)__builtin_aarch64_sqshl_nhi(__a, __b);
 }
-FUNK int32_t vqshls_n_s32(int32_t __a, const int __b) {
+__funline int32_t vqshls_n_s32(int32_t __a, const int __b) {
   return (int32_t)__builtin_aarch64_sqshl_nsi(__a, __b);
 }
-FUNK int64_t vqshld_n_s64(int64_t __a, const int __b) {
+__funline int64_t vqshld_n_s64(int64_t __a, const int __b) {
   return __builtin_aarch64_sqshl_ndi(__a, __b);
 }
-FUNK uint8_t vqshlb_n_u8(uint8_t __a, const int __b) {
+__funline uint8_t vqshlb_n_u8(uint8_t __a, const int __b) {
   return __builtin_aarch64_uqshl_nqi_uus(__a, __b);
 }
-FUNK uint16_t vqshlh_n_u16(uint16_t __a, const int __b) {
+__funline uint16_t vqshlh_n_u16(uint16_t __a, const int __b) {
   return __builtin_aarch64_uqshl_nhi_uus(__a, __b);
 }
-FUNK uint32_t vqshls_n_u32(uint32_t __a, const int __b) {
+__funline uint32_t vqshls_n_u32(uint32_t __a, const int __b) {
   return __builtin_aarch64_uqshl_nsi_uus(__a, __b);
 }
-FUNK uint64_t vqshld_n_u64(uint64_t __a, const int __b) {
+__funline uint64_t vqshld_n_u64(uint64_t __a, const int __b) {
   return __builtin_aarch64_uqshl_ndi_uus(__a, __b);
 }
-FUNK uint8x8_t vqshlu_n_s8(int8x8_t __a, const int __b) {
+__funline uint8x8_t vqshlu_n_s8(int8x8_t __a, const int __b) {
   return __builtin_aarch64_sqshlu_nv8qi_uss(__a, __b);
 }
-FUNK uint16x4_t vqshlu_n_s16(int16x4_t __a, const int __b) {
+__funline uint16x4_t vqshlu_n_s16(int16x4_t __a, const int __b) {
   return __builtin_aarch64_sqshlu_nv4hi_uss(__a, __b);
 }
-FUNK uint32x2_t vqshlu_n_s32(int32x2_t __a, const int __b) {
+__funline uint32x2_t vqshlu_n_s32(int32x2_t __a, const int __b) {
   return __builtin_aarch64_sqshlu_nv2si_uss(__a, __b);
 }
-FUNK uint64x1_t vqshlu_n_s64(int64x1_t __a, const int __b) {
+__funline uint64x1_t vqshlu_n_s64(int64x1_t __a, const int __b) {
   return (uint64x1_t){__builtin_aarch64_sqshlu_ndi_uss(__a[0], __b)};
 }
-FUNK uint8x16_t vqshluq_n_s8(int8x16_t __a, const int __b) {
+__funline uint8x16_t vqshluq_n_s8(int8x16_t __a, const int __b) {
   return __builtin_aarch64_sqshlu_nv16qi_uss(__a, __b);
 }
-FUNK uint16x8_t vqshluq_n_s16(int16x8_t __a, const int __b) {
+__funline uint16x8_t vqshluq_n_s16(int16x8_t __a, const int __b) {
   return __builtin_aarch64_sqshlu_nv8hi_uss(__a, __b);
 }
-FUNK uint32x4_t vqshluq_n_s32(int32x4_t __a, const int __b) {
+__funline uint32x4_t vqshluq_n_s32(int32x4_t __a, const int __b) {
   return __builtin_aarch64_sqshlu_nv4si_uss(__a, __b);
 }
-FUNK uint64x2_t vqshluq_n_s64(int64x2_t __a, const int __b) {
+__funline uint64x2_t vqshluq_n_s64(int64x2_t __a, const int __b) {
   return __builtin_aarch64_sqshlu_nv2di_uss(__a, __b);
 }
-FUNK int8_t vqshlub_n_s8(int8_t __a, const int __b) {
+__funline int8_t vqshlub_n_s8(int8_t __a, const int __b) {
   return (int8_t)__builtin_aarch64_sqshlu_nqi_uss(__a, __b);
 }
-FUNK int16_t vqshluh_n_s16(int16_t __a, const int __b) {
+__funline int16_t vqshluh_n_s16(int16_t __a, const int __b) {
   return (int16_t)__builtin_aarch64_sqshlu_nhi_uss(__a, __b);
 }
-FUNK int32_t vqshlus_n_s32(int32_t __a, const int __b) {
+__funline int32_t vqshlus_n_s32(int32_t __a, const int __b) {
   return (int32_t)__builtin_aarch64_sqshlu_nsi_uss(__a, __b);
 }
-FUNK uint64_t vqshlud_n_s64(int64_t __a, const int __b) {
+__funline uint64_t vqshlud_n_s64(int64_t __a, const int __b) {
   return __builtin_aarch64_sqshlu_ndi_uss(__a, __b);
 }
-FUNK int8x8_t vqshrn_n_s16(int16x8_t __a, const int __b) {
+__funline int8x8_t vqshrn_n_s16(int16x8_t __a, const int __b) {
   return (int8x8_t)__builtin_aarch64_sqshrn_nv8hi(__a, __b);
 }
-FUNK int16x4_t vqshrn_n_s32(int32x4_t __a, const int __b) {
+__funline int16x4_t vqshrn_n_s32(int32x4_t __a, const int __b) {
   return (int16x4_t)__builtin_aarch64_sqshrn_nv4si(__a, __b);
 }
-FUNK int32x2_t vqshrn_n_s64(int64x2_t __a, const int __b) {
+__funline int32x2_t vqshrn_n_s64(int64x2_t __a, const int __b) {
   return (int32x2_t)__builtin_aarch64_sqshrn_nv2di(__a, __b);
 }
-FUNK uint8x8_t vqshrn_n_u16(uint16x8_t __a, const int __b) {
+__funline uint8x8_t vqshrn_n_u16(uint16x8_t __a, const int __b) {
   return __builtin_aarch64_uqshrn_nv8hi_uus(__a, __b);
 }
-FUNK uint16x4_t vqshrn_n_u32(uint32x4_t __a, const int __b) {
+__funline uint16x4_t vqshrn_n_u32(uint32x4_t __a, const int __b) {
   return __builtin_aarch64_uqshrn_nv4si_uus(__a, __b);
 }
-FUNK uint32x2_t vqshrn_n_u64(uint64x2_t __a, const int __b) {
+__funline uint32x2_t vqshrn_n_u64(uint64x2_t __a, const int __b) {
   return __builtin_aarch64_uqshrn_nv2di_uus(__a, __b);
 }
-FUNK int8_t vqshrnh_n_s16(int16_t __a, const int __b) {
+__funline int8_t vqshrnh_n_s16(int16_t __a, const int __b) {
   return (int8_t)__builtin_aarch64_sqshrn_nhi(__a, __b);
 }
-FUNK int16_t vqshrns_n_s32(int32_t __a, const int __b) {
+__funline int16_t vqshrns_n_s32(int32_t __a, const int __b) {
   return (int16_t)__builtin_aarch64_sqshrn_nsi(__a, __b);
 }
-FUNK int32_t vqshrnd_n_s64(int64_t __a, const int __b) {
+__funline int32_t vqshrnd_n_s64(int64_t __a, const int __b) {
   return (int32_t)__builtin_aarch64_sqshrn_ndi(__a, __b);
 }
-FUNK uint8_t vqshrnh_n_u16(uint16_t __a, const int __b) {
+__funline uint8_t vqshrnh_n_u16(uint16_t __a, const int __b) {
   return __builtin_aarch64_uqshrn_nhi_uus(__a, __b);
 }
-FUNK uint16_t vqshrns_n_u32(uint32_t __a, const int __b) {
+__funline uint16_t vqshrns_n_u32(uint32_t __a, const int __b) {
   return __builtin_aarch64_uqshrn_nsi_uus(__a, __b);
 }
-FUNK uint32_t vqshrnd_n_u64(uint64_t __a, const int __b) {
+__funline uint32_t vqshrnd_n_u64(uint64_t __a, const int __b) {
   return __builtin_aarch64_uqshrn_ndi_uus(__a, __b);
 }
-FUNK uint8x8_t vqshrun_n_s16(int16x8_t __a, const int __b) {
+__funline uint8x8_t vqshrun_n_s16(int16x8_t __a, const int __b) {
   return (uint8x8_t)__builtin_aarch64_sqshrun_nv8hi(__a, __b);
 }
-FUNK uint16x4_t vqshrun_n_s32(int32x4_t __a, const int __b) {
+__funline uint16x4_t vqshrun_n_s32(int32x4_t __a, const int __b) {
   return (uint16x4_t)__builtin_aarch64_sqshrun_nv4si(__a, __b);
 }
-FUNK uint32x2_t vqshrun_n_s64(int64x2_t __a, const int __b) {
+__funline uint32x2_t vqshrun_n_s64(int64x2_t __a, const int __b) {
   return (uint32x2_t)__builtin_aarch64_sqshrun_nv2di(__a, __b);
 }
-FUNK int8_t vqshrunh_n_s16(int16_t __a, const int __b) {
+__funline int8_t vqshrunh_n_s16(int16_t __a, const int __b) {
   return (int8_t)__builtin_aarch64_sqshrun_nhi(__a, __b);
 }
-FUNK int16_t vqshruns_n_s32(int32_t __a, const int __b) {
+__funline int16_t vqshruns_n_s32(int32_t __a, const int __b) {
   return (int16_t)__builtin_aarch64_sqshrun_nsi(__a, __b);
 }
-FUNK int32_t vqshrund_n_s64(int64_t __a, const int __b) {
+__funline int32_t vqshrund_n_s64(int64_t __a, const int __b) {
   return (int32_t)__builtin_aarch64_sqshrun_ndi(__a, __b);
 }
-FUNK int8_t vqsubb_s8(int8_t __a, int8_t __b) {
+__funline int8_t vqsubb_s8(int8_t __a, int8_t __b) {
   return (int8_t)__builtin_aarch64_sqsubqi(__a, __b);
 }
-FUNK int16_t vqsubh_s16(int16_t __a, int16_t __b) {
+__funline int16_t vqsubh_s16(int16_t __a, int16_t __b) {
   return (int16_t)__builtin_aarch64_sqsubhi(__a, __b);
 }
-FUNK int32_t vqsubs_s32(int32_t __a, int32_t __b) {
+__funline int32_t vqsubs_s32(int32_t __a, int32_t __b) {
   return (int32_t)__builtin_aarch64_sqsubsi(__a, __b);
 }
-FUNK int64_t vqsubd_s64(int64_t __a, int64_t __b) {
+__funline int64_t vqsubd_s64(int64_t __a, int64_t __b) {
   return __builtin_aarch64_sqsubdi(__a, __b);
 }
-FUNK uint8_t vqsubb_u8(uint8_t __a, uint8_t __b) {
+__funline uint8_t vqsubb_u8(uint8_t __a, uint8_t __b) {
   return (uint8_t)__builtin_aarch64_uqsubqi_uuu(__a, __b);
 }
-FUNK uint16_t vqsubh_u16(uint16_t __a, uint16_t __b) {
+__funline uint16_t vqsubh_u16(uint16_t __a, uint16_t __b) {
   return (uint16_t)__builtin_aarch64_uqsubhi_uuu(__a, __b);
 }
-FUNK uint32_t vqsubs_u32(uint32_t __a, uint32_t __b) {
+__funline uint32_t vqsubs_u32(uint32_t __a, uint32_t __b) {
   return (uint32_t)__builtin_aarch64_uqsubsi_uuu(__a, __b);
 }
-FUNK uint64_t vqsubd_u64(uint64_t __a, uint64_t __b) {
+__funline uint64_t vqsubd_u64(uint64_t __a, uint64_t __b) {
   return __builtin_aarch64_uqsubdi_uuu(__a, __b);
 }
-FUNK int8x8_t vqtbl2_s8(int8x16x2_t tab, uint8x8_t idx) {
+__funline int8x8_t vqtbl2_s8(int8x16x2_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_oi __o;
   __o = __builtin_aarch64_set_qregoiv16qi(__o, tab.val[0], 0);
   __o = __builtin_aarch64_set_qregoiv16qi(__o, tab.val[1], 1);
   return __builtin_aarch64_tbl3v8qi(__o, (int8x8_t)idx);
 }
-FUNK uint8x8_t vqtbl2_u8(uint8x16x2_t tab, uint8x8_t idx) {
+__funline uint8x8_t vqtbl2_u8(uint8x16x2_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_oi __o;
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[1], 1);
   return (uint8x8_t)__builtin_aarch64_tbl3v8qi(__o, (int8x8_t)idx);
 }
-FUNK poly8x8_t vqtbl2_p8(poly8x16x2_t tab, uint8x8_t idx) {
+__funline poly8x8_t vqtbl2_p8(poly8x16x2_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_oi __o;
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[1], 1);
   return (poly8x8_t)__builtin_aarch64_tbl3v8qi(__o, (int8x8_t)idx);
 }
-FUNK int8x16_t vqtbl2q_s8(int8x16x2_t tab, uint8x16_t idx) {
+__funline int8x16_t vqtbl2q_s8(int8x16x2_t tab, uint8x16_t idx) {
   __builtin_aarch64_simd_oi __o;
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[1], 1);
   return __builtin_aarch64_tbl3v16qi(__o, (int8x16_t)idx);
 }
-FUNK uint8x16_t vqtbl2q_u8(uint8x16x2_t tab, uint8x16_t idx) {
+__funline uint8x16_t vqtbl2q_u8(uint8x16x2_t tab, uint8x16_t idx) {
   __builtin_aarch64_simd_oi __o;
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[1], 1);
   return (uint8x16_t)__builtin_aarch64_tbl3v16qi(__o, (int8x16_t)idx);
 }
-FUNK poly8x16_t vqtbl2q_p8(poly8x16x2_t tab, uint8x16_t idx) {
+__funline poly8x16_t vqtbl2q_p8(poly8x16x2_t tab, uint8x16_t idx) {
   __builtin_aarch64_simd_oi __o;
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[1], 1);
   return (poly8x16_t)__builtin_aarch64_tbl3v16qi(__o, (int8x16_t)idx);
 }
-FUNK int8x8_t vqtbl3_s8(int8x16x3_t tab, uint8x8_t idx) {
+__funline int8x8_t vqtbl3_s8(int8x16x3_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_ci __o;
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16189,7 +16277,7 @@ FUNK int8x8_t vqtbl3_s8(int8x16x3_t tab, uint8x8_t idx) {
   return __builtin_aarch64_qtbl3v8qi(__o, (int8x8_t)idx);
 }
-FUNK uint8x8_t vqtbl3_u8(uint8x16x3_t tab, uint8x8_t idx) {
+__funline uint8x8_t vqtbl3_u8(uint8x16x3_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_ci __o;
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16197,7 +16285,7 @@ FUNK uint8x8_t vqtbl3_u8(uint8x16x3_t tab, uint8x8_t idx) {
   return (uint8x8_t)__builtin_aarch64_qtbl3v8qi(__o, (int8x8_t)idx);
 }
-FUNK poly8x8_t vqtbl3_p8(poly8x16x3_t tab, uint8x8_t idx) {
+__funline poly8x8_t vqtbl3_p8(poly8x16x3_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_ci __o;
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16205,7 +16293,7 @@ FUNK poly8x8_t vqtbl3_p8(poly8x16x3_t tab, uint8x8_t idx) {
   return (poly8x8_t)__builtin_aarch64_qtbl3v8qi(__o, (int8x8_t)idx);
 }
-FUNK int8x16_t vqtbl3q_s8(int8x16x3_t tab, uint8x16_t idx) {
+__funline int8x16_t vqtbl3q_s8(int8x16x3_t tab, uint8x16_t idx) {
   __builtin_aarch64_simd_ci __o;
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16213,7 +16301,7 @@ FUNK int8x16_t vqtbl3q_s8(int8x16x3_t tab, uint8x16_t idx) {
   return __builtin_aarch64_qtbl3v16qi(__o, (int8x16_t)idx);
 }
-FUNK uint8x16_t vqtbl3q_u8(uint8x16x3_t tab, uint8x16_t idx) {
+__funline uint8x16_t vqtbl3q_u8(uint8x16x3_t tab, uint8x16_t idx) {
   __builtin_aarch64_simd_ci __o;
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16221,7 +16309,7 @@ FUNK uint8x16_t vqtbl3q_u8(uint8x16x3_t tab, uint8x16_t idx) {
   return (uint8x16_t)__builtin_aarch64_qtbl3v16qi(__o, (int8x16_t)idx);
 }
-FUNK poly8x16_t vqtbl3q_p8(poly8x16x3_t tab, uint8x16_t idx) {
+__funline poly8x16_t vqtbl3q_p8(poly8x16x3_t tab, uint8x16_t idx) {
   __builtin_aarch64_simd_ci __o;
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16229,7 +16317,7 @@ FUNK poly8x16_t vqtbl3q_p8(poly8x16x3_t tab, uint8x16_t idx) {
   return (poly8x16_t)__builtin_aarch64_qtbl3v16qi(__o, (int8x16_t)idx);
 }
-FUNK int8x8_t vqtbl4_s8(int8x16x4_t tab, uint8x8_t idx) {
+__funline int8x8_t vqtbl4_s8(int8x16x4_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16238,7 +16326,7 @@ FUNK int8x8_t vqtbl4_s8(int8x16x4_t tab, uint8x8_t idx) {
   return __builtin_aarch64_qtbl4v8qi(__o, (int8x8_t)idx);
 }
-FUNK uint8x8_t vqtbl4_u8(uint8x16x4_t tab, uint8x8_t idx) {
+__funline uint8x8_t vqtbl4_u8(uint8x16x4_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16247,7 +16335,7 @@ FUNK uint8x8_t vqtbl4_u8(uint8x16x4_t tab, uint8x8_t idx) {
   return (uint8x8_t)__builtin_aarch64_qtbl4v8qi(__o, (int8x8_t)idx);
 }
-FUNK poly8x8_t vqtbl4_p8(poly8x16x4_t tab, uint8x8_t idx) {
+__funline poly8x8_t vqtbl4_p8(poly8x16x4_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16256,7 +16344,7 @@ FUNK poly8x8_t vqtbl4_p8(poly8x16x4_t tab, uint8x8_t idx) {
   return (poly8x8_t)__builtin_aarch64_qtbl4v8qi(__o, (int8x8_t)idx);
 }
-FUNK int8x16_t vqtbl4q_s8(int8x16x4_t tab, uint8x16_t idx) {
+__funline int8x16_t vqtbl4q_s8(int8x16x4_t tab, uint8x16_t idx) {
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16265,7 +16353,7 @@ FUNK int8x16_t vqtbl4q_s8(int8x16x4_t tab, uint8x16_t idx) {
   return __builtin_aarch64_qtbl4v16qi(__o, (int8x16_t)idx);
 }
-FUNK uint8x16_t vqtbl4q_u8(uint8x16x4_t tab, uint8x16_t idx) {
+__funline uint8x16_t vqtbl4q_u8(uint8x16x4_t tab, uint8x16_t idx) {
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16274,7 +16362,7 @@ FUNK uint8x16_t vqtbl4q_u8(uint8x16x4_t tab, uint8x16_t idx) {
   return (uint8x16_t)__builtin_aarch64_qtbl4v16qi(__o, (int8x16_t)idx);
 }
-FUNK poly8x16_t vqtbl4q_p8(poly8x16x4_t tab, uint8x16_t idx) {
+__funline poly8x16_t vqtbl4q_p8(poly8x16x4_t tab, uint8x16_t idx) {
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16283,35 +16371,36 @@ FUNK poly8x16_t vqtbl4q_p8(poly8x16x4_t tab, uint8x16_t idx) {
   return (poly8x16_t)__builtin_aarch64_qtbl4v16qi(__o, (int8x16_t)idx);
 }
-FUNK int8x8_t vqtbx2_s8(int8x8_t r, int8x16x2_t tab, uint8x8_t idx) {
+__funline int8x8_t vqtbx2_s8(int8x8_t r, int8x16x2_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_oi __o;
   __o = __builtin_aarch64_set_qregoiv16qi(__o, tab.val[0], 0);
   __o = __builtin_aarch64_set_qregoiv16qi(__o, tab.val[1], 1);
   return __builtin_aarch64_tbx4v8qi(r, __o, (int8x8_t)idx);
 }
-FUNK uint8x8_t vqtbx2_u8(uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx) {
+__funline uint8x8_t vqtbx2_u8(uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_oi __o;
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[1], 1);
   return (uint8x8_t)__builtin_aarch64_tbx4v8qi((int8x8_t)r, __o, (int8x8_t)idx);
 }
-FUNK poly8x8_t vqtbx2_p8(poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx) {
+__funline poly8x8_t vqtbx2_p8(poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_oi __o;
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[1], 1);
   return (poly8x8_t)__builtin_aarch64_tbx4v8qi((int8x8_t)r, __o, (int8x8_t)idx);
 }
-FUNK int8x16_t vqtbx2q_s8(int8x16_t r, int8x16x2_t tab, uint8x16_t idx) {
+__funline int8x16_t vqtbx2q_s8(int8x16_t r, int8x16x2_t tab, uint8x16_t idx) {
   __builtin_aarch64_simd_oi __o;
   __o = __builtin_aarch64_set_qregoiv16qi(__o, tab.val[0], 0);
   __o = __builtin_aarch64_set_qregoiv16qi(__o, tab.val[1], 1);
   return __builtin_aarch64_tbx4v16qi(r, __o, (int8x16_t)idx);
 }
-FUNK uint8x16_t vqtbx2q_u8(uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx) {
+__funline uint8x16_t vqtbx2q_u8(uint8x16_t r, uint8x16x2_t tab,
+                               uint8x16_t idx) {
   __builtin_aarch64_simd_oi __o;
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16319,7 +16408,8 @@ FUNK uint8x16_t vqtbx2q_u8(uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx) {
                                              (int8x16_t)idx);
 }
-FUNK poly8x16_t vqtbx2q_p8(poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx) {
+__funline poly8x16_t vqtbx2q_p8(poly8x16_t r, poly8x16x2_t tab,
+                               uint8x16_t idx) {
   __builtin_aarch64_simd_oi __o;
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16327,7 +16417,7 @@ FUNK poly8x16_t vqtbx2q_p8(poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx) {
                                              (int8x16_t)idx);
 }
-FUNK int8x8_t vqtbx3_s8(int8x8_t r, int8x16x3_t tab, uint8x8_t idx) {
+__funline int8x8_t vqtbx3_s8(int8x8_t r, int8x16x3_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_ci __o;
   __o = __builtin_aarch64_set_qregciv16qi(__o, tab.val[0], 0);
   __o = __builtin_aarch64_set_qregciv16qi(__o, tab.val[1], 1);
@@ -16335,7 +16425,7 @@ FUNK int8x8_t vqtbx3_s8(int8x8_t r, int8x16x3_t tab, uint8x8_t idx) {
   return __builtin_aarch64_qtbx3v8qi(r, __o, (int8x8_t)idx);
 }
-FUNK uint8x8_t vqtbx3_u8(uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx) {
+__funline uint8x8_t vqtbx3_u8(uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_ci __o;
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16344,7 +16434,7 @@ FUNK uint8x8_t vqtbx3_u8(uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx) {
                                            (int8x8_t)idx);
 }
-FUNK poly8x8_t vqtbx3_p8(poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx) {
+__funline poly8x8_t vqtbx3_p8(poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_ci __o;
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16353,7 +16443,7 @@ FUNK poly8x8_t vqtbx3_p8(poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx) {
                                            (int8x8_t)idx);
 }
-FUNK int8x16_t vqtbx3q_s8(int8x16_t r, int8x16x3_t tab, uint8x16_t idx) {
+__funline int8x16_t vqtbx3q_s8(int8x16_t r, int8x16x3_t tab, uint8x16_t idx) {
   __builtin_aarch64_simd_ci __o;
   __o = __builtin_aarch64_set_qregciv16qi(__o, tab.val[0], 0);
   __o = __builtin_aarch64_set_qregciv16qi(__o, tab.val[1], 1);
@@ -16361,7 +16451,8 @@ FUNK int8x16_t vqtbx3q_s8(int8x16_t r, int8x16x3_t tab, uint8x16_t idx) {
   return __builtin_aarch64_qtbx3v16qi(r, __o, (int8x16_t)idx);
 }
-FUNK uint8x16_t vqtbx3q_u8(uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx) {
+__funline uint8x16_t vqtbx3q_u8(uint8x16_t r, uint8x16x3_t tab,
+                               uint8x16_t idx) {
   __builtin_aarch64_simd_ci __o;
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16370,7 +16461,8 @@ FUNK uint8x16_t vqtbx3q_u8(uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx) {
                                              (int8x16_t)idx);
 }
-FUNK poly8x16_t vqtbx3q_p8(poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx) {
+__funline poly8x16_t vqtbx3q_p8(poly8x16_t r, poly8x16x3_t tab,
+                               uint8x16_t idx) {
   __builtin_aarch64_simd_ci __o;
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16379,7 +16471,7 @@ FUNK poly8x16_t vqtbx3q_p8(poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx) {
                                              (int8x16_t)idx);
 }
-FUNK int8x8_t vqtbx4_s8(int8x8_t r, int8x16x4_t tab, uint8x8_t idx) {
+__funline int8x8_t vqtbx4_s8(int8x8_t r, int8x16x4_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_set_qregxiv16qi(__o, tab.val[0], 0);
   __o = __builtin_aarch64_set_qregxiv16qi(__o, tab.val[1], 1);
@@ -16388,7 +16480,7 @@ FUNK int8x8_t vqtbx4_s8(int8x8_t r, int8x16x4_t tab, uint8x8_t idx) {
   return __builtin_aarch64_qtbx4v8qi(r, __o, (int8x8_t)idx);
 }
-FUNK uint8x8_t vqtbx4_u8(uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx) {
+__funline uint8x8_t vqtbx4_u8(uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16398,7 +16490,7 @@ FUNK uint8x8_t vqtbx4_u8(uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx) {
                                            (int8x8_t)idx);
 }
-FUNK poly8x8_t vqtbx4_p8(poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx) {
+__funline poly8x8_t vqtbx4_p8(poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx) {
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16408,7 +16500,7 @@ FUNK poly8x8_t vqtbx4_p8(poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx) {
                                            (int8x8_t)idx);
 }
-FUNK int8x16_t vqtbx4q_s8(int8x16_t r, int8x16x4_t tab, uint8x16_t idx) {
+__funline int8x16_t vqtbx4q_s8(int8x16_t r, int8x16x4_t tab, uint8x16_t idx) {
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_set_qregxiv16qi(__o, tab.val[0], 0);
   __o = __builtin_aarch64_set_qregxiv16qi(__o, tab.val[1], 1);
@@ -16417,7 +16509,8 @@ FUNK int8x16_t vqtbx4q_s8(int8x16_t r, int8x16x4_t tab, uint8x16_t idx) {
   return __builtin_aarch64_qtbx4v16qi(r, __o, (int8x16_t)idx);
 }
-FUNK uint8x16_t vqtbx4q_u8(uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx) {
+__funline uint8x16_t vqtbx4q_u8(uint8x16_t r, uint8x16x4_t tab,
+                               uint8x16_t idx) {
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16427,7 +16520,8 @@ FUNK uint8x16_t vqtbx4q_u8(uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx) {
                                              (int8x16_t)idx);
 }
-FUNK poly8x16_t vqtbx4q_p8(poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx) {
+__funline poly8x16_t vqtbx4q_p8(poly8x16_t r, poly8x16x4_t tab,
+                               uint8x16_t idx) {
   __builtin_aarch64_simd_xi __o;
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[0], 0);
   __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)tab.val[1], 1);
@@ -16437,1470 +16531,1493 @@ FUNK poly8x16_t vqtbx4q_p8(poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx) {
                                              (int8x16_t)idx);
 }
-FUNK poly8x8_t vrbit_p8(poly8x8_t __a) {
+__funline poly8x8_t vrbit_p8(poly8x8_t __a) {
   return (poly8x8_t)__builtin_aarch64_rbitv8qi((int8x8_t)__a);
 }
-FUNK int8x8_t vrbit_s8(int8x8_t __a) {
+__funline int8x8_t vrbit_s8(int8x8_t __a) {
   return __builtin_aarch64_rbitv8qi(__a);
 }
-FUNK uint8x8_t vrbit_u8(uint8x8_t __a) {
+__funline uint8x8_t vrbit_u8(uint8x8_t __a) {
   return (uint8x8_t)__builtin_aarch64_rbitv8qi((int8x8_t)__a);
 }
-FUNK poly8x16_t vrbitq_p8(poly8x16_t __a) {
+__funline poly8x16_t vrbitq_p8(poly8x16_t __a) {
   return (poly8x16_t)__builtin_aarch64_rbitv16qi((int8x16_t)__a);
 }
-FUNK int8x16_t vrbitq_s8(int8x16_t __a) {
+__funline int8x16_t vrbitq_s8(int8x16_t __a) {
   return __builtin_aarch64_rbitv16qi(__a);
 }
-FUNK uint8x16_t vrbitq_u8(uint8x16_t __a) {
+__funline uint8x16_t vrbitq_u8(uint8x16_t __a) {
   return (uint8x16_t)__builtin_aarch64_rbitv16qi((int8x16_t)__a);
 }
-FUNK uint32x2_t vrecpe_u32(uint32x2_t __a) {
+__funline uint32x2_t vrecpe_u32(uint32x2_t __a) {
   return (uint32x2_t)__builtin_aarch64_urecpev2si((int32x2_t)__a);
 }
-FUNK uint32x4_t vrecpeq_u32(uint32x4_t __a) {
+__funline uint32x4_t vrecpeq_u32(uint32x4_t __a) {
   return (uint32x4_t)__builtin_aarch64_urecpev4si((int32x4_t)__a);
 }
-FUNK float32_t vrecpes_f32(float32_t __a) {
+__funline float32_t vrecpes_f32(float32_t __a) {
   return __builtin_aarch64_frecpesf(__a);
 }
-FUNK float64_t vrecped_f64(float64_t __a) {
+__funline float64_t vrecped_f64(float64_t __a) {
   return __builtin_aarch64_frecpedf(__a);
 }
-FUNK float32x2_t vrecpe_f32(float32x2_t __a) {
+__funline float32x2_t vrecpe_f32(float32x2_t __a) {
   return __builtin_aarch64_frecpev2sf(__a);
 }
-FUNK float64x1_t vrecpe_f64(float64x1_t __a) {
+__funline float64x1_t vrecpe_f64(float64x1_t __a) {
   return (float64x1_t){vrecped_f64(vget_lane_f64(__a, 0))};
 }
-FUNK float32x4_t vrecpeq_f32(float32x4_t __a) {
+__funline float32x4_t vrecpeq_f32(float32x4_t __a) {
   return __builtin_aarch64_frecpev4sf(__a);
 }
-FUNK float64x2_t vrecpeq_f64(float64x2_t __a) {
+__funline float64x2_t vrecpeq_f64(float64x2_t __a) {
   return __builtin_aarch64_frecpev2df(__a);
 }
-FUNK float32_t vrecpss_f32(float32_t __a, float32_t __b) {
+__funline float32_t vrecpss_f32(float32_t __a, float32_t __b) {
   return __builtin_aarch64_frecpssf(__a, __b);
 }
-FUNK float64_t vrecpsd_f64(float64_t __a, float64_t __b) {
+__funline float64_t vrecpsd_f64(float64_t __a, float64_t __b) {
   return __builtin_aarch64_frecpsdf(__a, __b);
 }
-FUNK float32x2_t vrecps_f32(float32x2_t __a, float32x2_t __b) {
+__funline float32x2_t vrecps_f32(float32x2_t __a, float32x2_t __b) {
   return __builtin_aarch64_frecpsv2sf(__a, __b);
 }
-FUNK float64x1_t vrecps_f64(float64x1_t __a, float64x1_t __b) {
+__funline float64x1_t vrecps_f64(float64x1_t __a, float64x1_t __b) {
   return (float64x1_t){
       vrecpsd_f64(vget_lane_f64(__a, 0), vget_lane_f64(__b, 0))};
 }
-FUNK float32x4_t vrecpsq_f32(float32x4_t __a, float32x4_t __b) {
+__funline float32x4_t vrecpsq_f32(float32x4_t __a, float32x4_t __b) {
   return __builtin_aarch64_frecpsv4sf(__a, __b);
 }
-FUNK float64x2_t vrecpsq_f64(float64x2_t __a, float64x2_t __b) {
+__funline float64x2_t vrecpsq_f64(float64x2_t __a, float64x2_t __b) {
   return __builtin_aarch64_frecpsv2df(__a, __b);
 }
-FUNK float32_t vrecpxs_f32(float32_t __a) {
+__funline float32_t vrecpxs_f32(float32_t __a) {
   return __builtin_aarch64_frecpxsf(__a);
 }
-FUNK float64_t vrecpxd_f64(float64_t __a) {
+__funline float64_t vrecpxd_f64(float64_t __a) {
   return __builtin_aarch64_frecpxdf(__a);
 }
-FUNK poly8x8_t vrev16_p8(poly8x8_t a) {
+__funline poly8x8_t vrev16_p8(poly8x8_t a) {
   return __builtin_shuffle(a, (uint8x8_t){1, 0, 3, 2, 5, 4, 7, 6});
 }
-FUNK int8x8_t vrev16_s8(int8x8_t a) {
+__funline int8x8_t vrev16_s8(int8x8_t a) {
   return __builtin_shuffle(a, (uint8x8_t){1, 0, 3, 2, 5, 4, 7, 6});
 }
-FUNK uint8x8_t vrev16_u8(uint8x8_t a) {
+__funline uint8x8_t vrev16_u8(uint8x8_t a) {
   return __builtin_shuffle(a, (uint8x8_t){1, 0, 3, 2, 5, 4, 7, 6});
 }
-FUNK poly8x16_t vrev16q_p8(poly8x16_t a) {
+__funline poly8x16_t vrev16q_p8(poly8x16_t a) {
   return __builtin_shuffle(
       a, (uint8x16_t){1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14});
 }
-FUNK int8x16_t vrev16q_s8(int8x16_t a) {
+__funline int8x16_t vrev16q_s8(int8x16_t a) {
   return __builtin_shuffle(
      a, (uint8x16_t){1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14});
 }
-FUNK uint8x16_t vrev16q_u8(uint8x16_t a) {
+__funline uint8x16_t vrev16q_u8(uint8x16_t a) {
   return __builtin_shuffle(
       a, (uint8x16_t){1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14});
 }
-FUNK poly8x8_t vrev32_p8(poly8x8_t a) {
+__funline poly8x8_t vrev32_p8(poly8x8_t a) {
   return __builtin_shuffle(a, (uint8x8_t){3, 2, 1, 0, 7, 6, 5, 4});
 }
-FUNK poly16x4_t vrev32_p16(poly16x4_t a) {
+__funline poly16x4_t vrev32_p16(poly16x4_t a) {
   return __builtin_shuffle(a, (uint16x4_t){1, 0, 3, 2});
 }
-FUNK int8x8_t vrev32_s8(int8x8_t a) {
+__funline int8x8_t vrev32_s8(int8x8_t a) {
   return __builtin_shuffle(a, (uint8x8_t){3, 2, 1, 0, 7, 6, 5, 4});
 }
-FUNK int16x4_t vrev32_s16(int16x4_t a) {
+__funline int16x4_t vrev32_s16(int16x4_t a) {
   return __builtin_shuffle(a, (uint16x4_t){1, 0, 3, 2});
 }
-FUNK uint8x8_t vrev32_u8(uint8x8_t a) {
+__funline uint8x8_t vrev32_u8(uint8x8_t a) {
   return __builtin_shuffle(a, (uint8x8_t){3, 2, 1, 0, 7, 6, 5, 4});
 }
-FUNK uint16x4_t vrev32_u16(uint16x4_t a) {
+__funline uint16x4_t vrev32_u16(uint16x4_t a) {
   return __builtin_shuffle(a, (uint16x4_t){1, 0, 3, 2});
 }
-FUNK poly8x16_t vrev32q_p8(poly8x16_t a) {
+__funline poly8x16_t vrev32q_p8(poly8x16_t a) {
   return __builtin_shuffle(
       a, (uint8x16_t){3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12});
 }
-FUNK poly16x8_t vrev32q_p16(poly16x8_t a) {
+__funline poly16x8_t vrev32q_p16(poly16x8_t a) {
   return __builtin_shuffle(a, (uint16x8_t){1, 0, 3, 2, 5, 4, 7, 6});
 }
-FUNK int8x16_t vrev32q_s8(int8x16_t a) {
+__funline int8x16_t vrev32q_s8(int8x16_t a) {
   return __builtin_shuffle(
       a, (uint8x16_t){3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12});
 }
-FUNK int16x8_t vrev32q_s16(int16x8_t a) {
+__funline int16x8_t vrev32q_s16(int16x8_t a) {
   return __builtin_shuffle(a, (uint16x8_t){1, 0, 3, 2, 5, 4, 7, 6});
 }
-FUNK uint8x16_t vrev32q_u8(uint8x16_t a) {
+__funline uint8x16_t vrev32q_u8(uint8x16_t a) {
   return __builtin_shuffle(
      a, (uint8x16_t){3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12});
 }
-FUNK uint16x8_t vrev32q_u16(uint16x8_t a) {
+__funline uint16x8_t vrev32q_u16(uint16x8_t a) {
   return __builtin_shuffle(a, (uint16x8_t){1, 0, 3, 2, 5, 4, 7, 6});
 }
-FUNK float16x4_t vrev64_f16(float16x4_t __a) {
+__funline float16x4_t vrev64_f16(float16x4_t __a) {
   return __builtin_shuffle(__a, (uint16x4_t){3, 2, 1, 0});
 }
-FUNK float32x2_t vrev64_f32(float32x2_t a) {
+__funline float32x2_t vrev64_f32(float32x2_t a) {
   return __builtin_shuffle(a, (uint32x2_t){1, 0});
 }
-FUNK poly8x8_t vrev64_p8(poly8x8_t a) {
+__funline poly8x8_t vrev64_p8(poly8x8_t a) {
   return __builtin_shuffle(a, (uint8x8_t){7, 6, 5, 4, 3, 2, 1, 0});
 }
-FUNK poly16x4_t vrev64_p16(poly16x4_t a) {
+__funline poly16x4_t vrev64_p16(poly16x4_t a) {
   return __builtin_shuffle(a, (uint16x4_t){3, 2, 1, 0});
 }
-FUNK int8x8_t vrev64_s8(int8x8_t a) {
+__funline int8x8_t vrev64_s8(int8x8_t a) {
   return __builtin_shuffle(a, (uint8x8_t){7, 6, 5, 4, 3, 2, 1, 0});
 }
-FUNK int16x4_t vrev64_s16(int16x4_t a) {
+__funline int16x4_t vrev64_s16(int16x4_t a) {
   return __builtin_shuffle(a, (uint16x4_t){3, 2, 1, 0});
 }
-FUNK int32x2_t vrev64_s32(int32x2_t a) {
+__funline int32x2_t vrev64_s32(int32x2_t a) {
   return __builtin_shuffle(a, (uint32x2_t){1, 0});
 }
-FUNK uint8x8_t vrev64_u8(uint8x8_t a) {
+__funline uint8x8_t vrev64_u8(uint8x8_t a) {
   return __builtin_shuffle(a, (uint8x8_t){7, 6, 5, 4, 3, 2, 1, 0});
 }
-FUNK uint16x4_t vrev64_u16(uint16x4_t a) {
+__funline uint16x4_t vrev64_u16(uint16x4_t a) {
   return __builtin_shuffle(a, (uint16x4_t){3, 2, 1, 0});
 }
-FUNK uint32x2_t vrev64_u32(uint32x2_t a) {
+__funline uint32x2_t vrev64_u32(uint32x2_t a) {
   return __builtin_shuffle(a, (uint32x2_t){1, 0});
 }
-FUNK float16x8_t vrev64q_f16(float16x8_t __a) {
+__funline float16x8_t vrev64q_f16(float16x8_t __a) {
   return __builtin_shuffle(__a, (uint16x8_t){3, 2, 1, 0, 7, 6, 5, 4});
 }
-FUNK float32x4_t vrev64q_f32(float32x4_t a) {
+__funline float32x4_t vrev64q_f32(float32x4_t a) {
   return __builtin_shuffle(a, (uint32x4_t){1, 0, 3, 2});
 }
-FUNK poly8x16_t vrev64q_p8(poly8x16_t a) {
+__funline poly8x16_t vrev64q_p8(poly8x16_t a) {
   return __builtin_shuffle(
      a, (uint8x16_t){7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8});
 }
-FUNK poly16x8_t vrev64q_p16(poly16x8_t a) {
+__funline poly16x8_t vrev64q_p16(poly16x8_t a) {
   return __builtin_shuffle(a, (uint16x8_t){3, 2, 1, 0, 7, 6, 5, 4});
 }
-FUNK int8x16_t vrev64q_s8(int8x16_t a) {
+__funline int8x16_t vrev64q_s8(int8x16_t a) {
   return __builtin_shuffle(
      a, (uint8x16_t){7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8});
 }
-FUNK int16x8_t vrev64q_s16(int16x8_t a) {
+__funline int16x8_t vrev64q_s16(int16x8_t a) {
   return __builtin_shuffle(a, (uint16x8_t){3, 2, 1, 0, 7, 6, 5, 4});
 }
-FUNK int32x4_t vrev64q_s32(int32x4_t a) {
+__funline int32x4_t vrev64q_s32(int32x4_t a) {
   return __builtin_shuffle(a, (uint32x4_t){1, 0, 3, 2});
 }
-FUNK uint8x16_t vrev64q_u8(uint8x16_t a) {
+__funline uint8x16_t vrev64q_u8(uint8x16_t a) {
   return __builtin_shuffle(
      a, (uint8x16_t){7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8});
 }
-FUNK uint16x8_t vrev64q_u16(uint16x8_t a) {
+__funline uint16x8_t vrev64q_u16(uint16x8_t a) {
   return __builtin_shuffle(a, (uint16x8_t){3, 2, 1, 0, 7, 6, 5, 4});
 }
-FUNK uint32x4_t vrev64q_u32(uint32x4_t a) {
+__funline uint32x4_t vrev64q_u32(uint32x4_t a) {
   return __builtin_shuffle(a, (uint32x4_t){1, 0, 3, 2});
 }
-FUNK float32x2_t vrnd_f32(float32x2_t __a) {
+__funline float32x2_t vrnd_f32(float32x2_t __a) {
   return __builtin_aarch64_btruncv2sf(__a);
 }
-FUNK float64x1_t vrnd_f64(float64x1_t __a) {
+__funline float64x1_t vrnd_f64(float64x1_t __a) {
   return vset_lane_f64(__builtin_trunc(vget_lane_f64(__a, 0)), __a, 0);
 }
-FUNK float32x4_t vrndq_f32(float32x4_t __a) {
+__funline float32x4_t vrndq_f32(float32x4_t __a) {
   return __builtin_aarch64_btruncv4sf(__a);
 }
-FUNK float64x2_t vrndq_f64(float64x2_t __a) {
+__funline float64x2_t vrndq_f64(float64x2_t __a) {
   return __builtin_aarch64_btruncv2df(__a);
 }
-FUNK float32x2_t vrnda_f32(float32x2_t __a) {
+__funline float32x2_t vrnda_f32(float32x2_t __a) {
   return __builtin_aarch64_roundv2sf(__a);
 }
-FUNK float64x1_t vrnda_f64(float64x1_t __a) {
+__funline float64x1_t vrnda_f64(float64x1_t __a) {
   return vset_lane_f64(__builtin_round(vget_lane_f64(__a, 0)), __a, 0);
 }
-FUNK float32x4_t vrndaq_f32(float32x4_t __a) {
+__funline float32x4_t vrndaq_f32(float32x4_t __a) {
   return __builtin_aarch64_roundv4sf(__a);
 }
-FUNK float64x2_t vrndaq_f64(float64x2_t __a) {
+__funline float64x2_t vrndaq_f64(float64x2_t __a) {
   return __builtin_aarch64_roundv2df(__a);
 }
-FUNK float32x2_t vrndi_f32(float32x2_t __a) {
+__funline float32x2_t vrndi_f32(float32x2_t __a) {
   return __builtin_aarch64_nearbyintv2sf(__a);
 }
-FUNK float64x1_t vrndi_f64(float64x1_t __a) {
+__funline float64x1_t vrndi_f64(float64x1_t __a) {
   return vset_lane_f64(__builtin_nearbyint(vget_lane_f64(__a, 0)), __a, 0);
 }
-FUNK float32x4_t vrndiq_f32(float32x4_t __a) {
+__funline float32x4_t vrndiq_f32(float32x4_t __a) {
   return __builtin_aarch64_nearbyintv4sf(__a);
 }
-FUNK float64x2_t vrndiq_f64(float64x2_t __a) {
+__funline float64x2_t vrndiq_f64(float64x2_t __a) {
   return __builtin_aarch64_nearbyintv2df(__a);
 }
-FUNK float32x2_t vrndm_f32(float32x2_t __a) {
+__funline float32x2_t vrndm_f32(float32x2_t __a) {
   return __builtin_aarch64_floorv2sf(__a);
 }
-FUNK float64x1_t vrndm_f64(float64x1_t __a) {
+__funline float64x1_t vrndm_f64(float64x1_t __a) {
   return vset_lane_f64(__builtin_floor(vget_lane_f64(__a, 0)), __a, 0);
 }
-FUNK float32x4_t vrndmq_f32(float32x4_t __a) {
+__funline float32x4_t vrndmq_f32(float32x4_t __a) {
   return __builtin_aarch64_floorv4sf(__a);
 }
-FUNK float64x2_t vrndmq_f64(float64x2_t __a) {
+__funline float64x2_t vrndmq_f64(float64x2_t __a) {
   return __builtin_aarch64_floorv2df(__a);
 }
-FUNK float32x2_t vrndn_f32(float32x2_t __a) {
+__funline float32x2_t vrndn_f32(float32x2_t __a) {
   return __builtin_aarch64_frintnv2sf(__a);
 }
-FUNK float64x1_t vrndn_f64(float64x1_t __a) {
+__funline float64x1_t vrndn_f64(float64x1_t __a) {
   return (float64x1_t){__builtin_aarch64_frintndf(__a[0])};
 }
-FUNK float32x4_t vrndnq_f32(float32x4_t __a) {
+__funline float32x4_t vrndnq_f32(float32x4_t __a) {
   return __builtin_aarch64_frintnv4sf(__a);
 }
-FUNK float64x2_t vrndnq_f64(float64x2_t __a) {
+__funline float64x2_t vrndnq_f64(float64x2_t __a) {
   return __builtin_aarch64_frintnv2df(__a);
 }
-FUNK float32x2_t vrndp_f32(float32x2_t __a) {
+__funline float32x2_t vrndp_f32(float32x2_t __a) {
   return __builtin_aarch64_ceilv2sf(__a);
 }
-FUNK float64x1_t vrndp_f64(float64x1_t __a) {
+__funline float64x1_t vrndp_f64(float64x1_t __a) {
   return vset_lane_f64(__builtin_ceil(vget_lane_f64(__a, 0)), __a, 0);
 }
-FUNK float32x4_t vrndpq_f32(float32x4_t __a) {
+__funline float32x4_t vrndpq_f32(float32x4_t __a) {
   return __builtin_aarch64_ceilv4sf(__a);
 }
-FUNK float64x2_t vrndpq_f64(float64x2_t __a) {
+__funline float64x2_t vrndpq_f64(float64x2_t __a) {
   return __builtin_aarch64_ceilv2df(__a);
 }
-FUNK float32x2_t vrndx_f32(float32x2_t __a) {
+__funline float32x2_t vrndx_f32(float32x2_t __a) {
   return __builtin_aarch64_rintv2sf(__a);
 }
-FUNK float64x1_t vrndx_f64(float64x1_t __a) {
+__funline float64x1_t vrndx_f64(float64x1_t __a) {
   return vset_lane_f64(__builtin_rint(vget_lane_f64(__a, 0)), __a, 0);
 }
-FUNK float32x4_t vrndxq_f32(float32x4_t __a) {
+__funline float32x4_t vrndxq_f32(float32x4_t __a) {
   return __builtin_aarch64_rintv4sf(__a);
 }
-FUNK float64x2_t vrndxq_f64(float64x2_t __a) {
+__funline float64x2_t vrndxq_f64(float64x2_t __a) {
   return __builtin_aarch64_rintv2df(__a);
 }
-FUNK int8x8_t vrshl_s8(int8x8_t __a, int8x8_t __b) {
+__funline int8x8_t vrshl_s8(int8x8_t __a, int8x8_t __b) {
   return (int8x8_t)__builtin_aarch64_srshlv8qi(__a, __b);
 }
-FUNK int16x4_t vrshl_s16(int16x4_t __a, int16x4_t __b) {
+__funline int16x4_t vrshl_s16(int16x4_t __a, int16x4_t __b) {
   return (int16x4_t)__builtin_aarch64_srshlv4hi(__a, __b);
 }
-FUNK int32x2_t vrshl_s32(int32x2_t __a, int32x2_t __b) {
+__funline int32x2_t vrshl_s32(int32x2_t __a, int32x2_t __b) {
   return (int32x2_t)__builtin_aarch64_srshlv2si(__a, __b);
 }
-FUNK int64x1_t vrshl_s64(int64x1_t __a, int64x1_t __b) {
+__funline int64x1_t vrshl_s64(int64x1_t __a, int64x1_t __b) {
   return (int64x1_t){__builtin_aarch64_srshldi(__a[0], __b[0])};
 }
-FUNK uint8x8_t vrshl_u8(uint8x8_t __a, int8x8_t __b) {
+__funline uint8x8_t vrshl_u8(uint8x8_t __a, int8x8_t __b) {
   return __builtin_aarch64_urshlv8qi_uus(__a, __b);
 }
-FUNK uint16x4_t vrshl_u16(uint16x4_t __a, int16x4_t __b) {
+__funline uint16x4_t vrshl_u16(uint16x4_t __a, int16x4_t __b) {
   return __builtin_aarch64_urshlv4hi_uus(__a, __b);
 }
-FUNK uint32x2_t vrshl_u32(uint32x2_t __a, int32x2_t __b) {
+__funline uint32x2_t vrshl_u32(uint32x2_t __a, int32x2_t __b) {
   return __builtin_aarch64_urshlv2si_uus(__a, __b);
 }
-FUNK uint64x1_t vrshl_u64(uint64x1_t __a, int64x1_t __b) {
+__funline uint64x1_t vrshl_u64(uint64x1_t __a, int64x1_t __b) {
   return (uint64x1_t){__builtin_aarch64_urshldi_uus(__a[0], __b[0])};
 }
-FUNK int8x16_t vrshlq_s8(int8x16_t __a, int8x16_t __b) {
+__funline int8x16_t vrshlq_s8(int8x16_t __a, int8x16_t __b) {
   return (int8x16_t)__builtin_aarch64_srshlv16qi(__a, __b);
 }
-FUNK int16x8_t vrshlq_s16(int16x8_t __a, int16x8_t __b) {
+__funline int16x8_t vrshlq_s16(int16x8_t __a, int16x8_t __b) {
   return (int16x8_t)__builtin_aarch64_srshlv8hi(__a, __b);
 }
-FUNK int32x4_t vrshlq_s32(int32x4_t __a, int32x4_t __b) {
+__funline int32x4_t vrshlq_s32(int32x4_t __a, int32x4_t __b) {
   return (int32x4_t)__builtin_aarch64_srshlv4si(__a, __b);
 }
-FUNK int64x2_t vrshlq_s64(int64x2_t __a, int64x2_t __b) {
+__funline int64x2_t vrshlq_s64(int64x2_t __a, int64x2_t __b) {
   return (int64x2_t)__builtin_aarch64_srshlv2di(__a, __b);
 }
-FUNK uint8x16_t vrshlq_u8(uint8x16_t __a, int8x16_t __b) {
+__funline uint8x16_t vrshlq_u8(uint8x16_t __a, int8x16_t __b) {
   return __builtin_aarch64_urshlv16qi_uus(__a, __b);
 }
-FUNK uint16x8_t vrshlq_u16(uint16x8_t __a, int16x8_t __b) {
+__funline uint16x8_t vrshlq_u16(uint16x8_t __a, int16x8_t __b) {
   return __builtin_aarch64_urshlv8hi_uus(__a, __b);
 }
-FUNK uint32x4_t vrshlq_u32(uint32x4_t __a, int32x4_t __b) {
+__funline uint32x4_t vrshlq_u32(uint32x4_t __a, int32x4_t __b) {
   return __builtin_aarch64_urshlv4si_uus(__a, __b);
 }
-FUNK uint64x2_t vrshlq_u64(uint64x2_t __a, int64x2_t __b) {
+__funline uint64x2_t vrshlq_u64(uint64x2_t __a, int64x2_t __b) {
   return __builtin_aarch64_urshlv2di_uus(__a, __b);
 }
-FUNK int64_t vrshld_s64(int64_t __a, int64_t __b) {
+__funline int64_t vrshld_s64(int64_t __a, int64_t __b) {
   return __builtin_aarch64_srshldi(__a, __b);
 }
-FUNK uint64_t vrshld_u64(uint64_t __a, int64_t __b) {
+__funline uint64_t vrshld_u64(uint64_t __a, int64_t __b) {
   return __builtin_aarch64_urshldi_uus(__a, __b);
 }
-FUNK int8x8_t vrshr_n_s8(int8x8_t __a, const int __b) {
+__funline int8x8_t vrshr_n_s8(int8x8_t __a, const int __b) {
   return (int8x8_t)__builtin_aarch64_srshr_nv8qi(__a, __b);
 }
-FUNK int16x4_t vrshr_n_s16(int16x4_t __a, const int __b) {
+__funline int16x4_t vrshr_n_s16(int16x4_t __a, const int __b) {
   return (int16x4_t)__builtin_aarch64_srshr_nv4hi(__a, __b);
 }
-FUNK int32x2_t vrshr_n_s32(int32x2_t __a, const int __b) {
+__funline int32x2_t vrshr_n_s32(int32x2_t __a, const int __b) {
   return (int32x2_t)__builtin_aarch64_srshr_nv2si(__a, __b);
 }
-FUNK int64x1_t vrshr_n_s64(int64x1_t __a, const int __b) {
+__funline int64x1_t vrshr_n_s64(int64x1_t __a, const int __b) {
   return (int64x1_t){__builtin_aarch64_srshr_ndi(__a[0], __b)};
 }
-FUNK uint8x8_t vrshr_n_u8(uint8x8_t __a, const int __b) {
+__funline uint8x8_t vrshr_n_u8(uint8x8_t __a, const int __b) {
   return __builtin_aarch64_urshr_nv8qi_uus(__a, __b);
 }
-FUNK uint16x4_t vrshr_n_u16(uint16x4_t __a, const int __b) {
+__funline uint16x4_t vrshr_n_u16(uint16x4_t __a, const int __b) {
   return __builtin_aarch64_urshr_nv4hi_uus(__a, __b);
 }
-FUNK uint32x2_t vrshr_n_u32(uint32x2_t __a, const int __b) {
+__funline uint32x2_t vrshr_n_u32(uint32x2_t __a, const int __b) {
   return __builtin_aarch64_urshr_nv2si_uus(__a, __b);
 }
-FUNK uint64x1_t vrshr_n_u64(uint64x1_t __a, const int __b) {
+__funline uint64x1_t vrshr_n_u64(uint64x1_t __a, const int __b) {
   return (uint64x1_t){__builtin_aarch64_urshr_ndi_uus(__a[0], __b)};
 }
-FUNK int8x16_t vrshrq_n_s8(int8x16_t __a, const int __b) {
+__funline int8x16_t vrshrq_n_s8(int8x16_t __a, const int __b) {
   return (int8x16_t)__builtin_aarch64_srshr_nv16qi(__a, __b);
 }
-FUNK int16x8_t vrshrq_n_s16(int16x8_t __a, const int __b) {
+__funline int16x8_t vrshrq_n_s16(int16x8_t __a, const int __b) {
   return (int16x8_t)__builtin_aarch64_srshr_nv8hi(__a, __b);
 }
-FUNK int32x4_t vrshrq_n_s32(int32x4_t __a, const int __b) {
+__funline int32x4_t vrshrq_n_s32(int32x4_t __a, const int __b) {
   return (int32x4_t)__builtin_aarch64_srshr_nv4si(__a, __b);
 }
-FUNK int64x2_t vrshrq_n_s64(int64x2_t __a, const int __b) {
+__funline int64x2_t vrshrq_n_s64(int64x2_t __a, const int __b) {
   return (int64x2_t)__builtin_aarch64_srshr_nv2di(__a, __b);
 }
-FUNK uint8x16_t vrshrq_n_u8(uint8x16_t __a, const int __b) {
+__funline uint8x16_t vrshrq_n_u8(uint8x16_t __a, const int __b) {
   return __builtin_aarch64_urshr_nv16qi_uus(__a, __b);
 }
-FUNK uint16x8_t vrshrq_n_u16(uint16x8_t __a, const int __b) {
+__funline uint16x8_t vrshrq_n_u16(uint16x8_t __a, const int __b) {
   return __builtin_aarch64_urshr_nv8hi_uus(__a, __b);
 }
-FUNK uint32x4_t vrshrq_n_u32(uint32x4_t __a, const int __b) {
+__funline uint32x4_t vrshrq_n_u32(uint32x4_t __a, const int __b) {
   return __builtin_aarch64_urshr_nv4si_uus(__a, __b);
 }
-FUNK uint64x2_t vrshrq_n_u64(uint64x2_t __a, const int __b) {
+__funline uint64x2_t vrshrq_n_u64(uint64x2_t __a, const int __b) {
   return __builtin_aarch64_urshr_nv2di_uus(__a, __b);
 }
-FUNK int64_t vrshrd_n_s64(int64_t __a, const int __b) {
+__funline int64_t vrshrd_n_s64(int64_t __a, const int __b) {
   return __builtin_aarch64_srshr_ndi(__a, __b);
 }
-FUNK uint64_t vrshrd_n_u64(uint64_t __a, const int __b) {
+__funline uint64_t vrshrd_n_u64(uint64_t __a, const int __b) {
   return __builtin_aarch64_urshr_ndi_uus(__a, __b);
 }
-FUNK float32_t vrsqrtes_f32(float32_t __a) {
+__funline float32_t vrsqrtes_f32(float32_t __a) {
   return __builtin_aarch64_rsqrtesf(__a);
 }
-FUNK float64_t vrsqrted_f64(float64_t __a) {
+__funline float64_t vrsqrted_f64(float64_t __a) {
   return __builtin_aarch64_rsqrtedf(__a);
 }
-FUNK float32x2_t vrsqrte_f32(float32x2_t __a) {
+__funline float32x2_t vrsqrte_f32(float32x2_t __a) {
   return __builtin_aarch64_rsqrtev2sf(__a);
 }
-FUNK float64x1_t vrsqrte_f64(float64x1_t __a) {
+__funline float64x1_t vrsqrte_f64(float64x1_t __a) {
   return (float64x1_t){vrsqrted_f64(vget_lane_f64(__a, 0))};
 }
-FUNK float32x4_t vrsqrteq_f32(float32x4_t __a) {
+__funline float32x4_t vrsqrteq_f32(float32x4_t __a) {
   return __builtin_aarch64_rsqrtev4sf(__a);
 }
-FUNK float64x2_t vrsqrteq_f64(float64x2_t __a) {
+__funline float64x2_t vrsqrteq_f64(float64x2_t __a) {
   return __builtin_aarch64_rsqrtev2df(__a);
 }
-FUNK float32_t vrsqrtss_f32(float32_t __a, float32_t __b) {
+__funline float32_t vrsqrtss_f32(float32_t __a, float32_t __b) {
   return __builtin_aarch64_rsqrtssf(__a, __b);
 }
-FUNK float64_t vrsqrtsd_f64(float64_t __a, float64_t __b) {
+__funline float64_t vrsqrtsd_f64(float64_t __a, float64_t __b) {
   return __builtin_aarch64_rsqrtsdf(__a, __b);
 }
-FUNK float32x2_t vrsqrts_f32(float32x2_t __a, float32x2_t __b) {
+__funline float32x2_t vrsqrts_f32(float32x2_t __a, float32x2_t __b) {
   return __builtin_aarch64_rsqrtsv2sf(__a, __b);
 }
-FUNK float64x1_t vrsqrts_f64(float64x1_t __a, float64x1_t __b) {
+__funline float64x1_t vrsqrts_f64(float64x1_t __a, float64x1_t __b) {
   return (float64x1_t){
       vrsqrtsd_f64(vget_lane_f64(__a, 0), vget_lane_f64(__b, 0))};
 }
-FUNK float32x4_t vrsqrtsq_f32(float32x4_t __a, float32x4_t __b) {
+__funline float32x4_t vrsqrtsq_f32(float32x4_t __a, float32x4_t __b) {
   return __builtin_aarch64_rsqrtsv4sf(__a, __b);
 }
-FUNK float64x2_t vrsqrtsq_f64(float64x2_t __a, float64x2_t __b) {
+__funline float64x2_t vrsqrtsq_f64(float64x2_t __a, float64x2_t __b) {
   return __builtin_aarch64_rsqrtsv2df(__a, __b);
 }
-FUNK int8x8_t vrsra_n_s8(int8x8_t __a, int8x8_t __b, const int __c) {
+__funline int8x8_t vrsra_n_s8(int8x8_t __a, int8x8_t __b, const int __c) {
   return (int8x8_t)__builtin_aarch64_srsra_nv8qi(__a, __b, __c);
 }
-FUNK int16x4_t vrsra_n_s16(int16x4_t __a, int16x4_t __b, const int __c) {
+__funline int16x4_t vrsra_n_s16(int16x4_t __a, int16x4_t __b, const int __c) {
   return (int16x4_t)__builtin_aarch64_srsra_nv4hi(__a, __b, __c);
 }
-FUNK int32x2_t vrsra_n_s32(int32x2_t __a, int32x2_t __b, const int __c) {
+__funline int32x2_t vrsra_n_s32(int32x2_t __a, int32x2_t __b, const int __c) {
   return (int32x2_t)__builtin_aarch64_srsra_nv2si(__a, __b, __c);
 }
-FUNK int64x1_t vrsra_n_s64(int64x1_t __a, int64x1_t __b, const int __c) {
+__funline int64x1_t vrsra_n_s64(int64x1_t __a, int64x1_t __b, const int __c) {
   return (int64x1_t){__builtin_aarch64_srsra_ndi(__a[0], __b[0], __c)};
 }
-FUNK uint8x8_t vrsra_n_u8(uint8x8_t __a, uint8x8_t __b, const int __c) {
+__funline uint8x8_t vrsra_n_u8(uint8x8_t __a, uint8x8_t __b, const int __c) {
   return __builtin_aarch64_ursra_nv8qi_uuus(__a, __b, __c);
 }
-FUNK uint16x4_t vrsra_n_u16(uint16x4_t __a, uint16x4_t __b, const int __c) {
+__funline uint16x4_t vrsra_n_u16(uint16x4_t __a, uint16x4_t __b,
+                                const int __c) {
   return __builtin_aarch64_ursra_nv4hi_uuus(__a, __b, __c);
 }
-FUNK uint32x2_t vrsra_n_u32(uint32x2_t __a, uint32x2_t __b, const int __c) {
+__funline uint32x2_t vrsra_n_u32(uint32x2_t __a, uint32x2_t __b,
+                                const int __c) {
   return __builtin_aarch64_ursra_nv2si_uuus(__a, __b, __c);
 }
-FUNK uint64x1_t vrsra_n_u64(uint64x1_t __a, uint64x1_t __b, const int __c) {
+__funline uint64x1_t vrsra_n_u64(uint64x1_t __a, uint64x1_t __b,
+                                const int __c) {
   return (uint64x1_t){__builtin_aarch64_ursra_ndi_uuus(__a[0], __b[0], __c)};
 }
-FUNK int8x16_t vrsraq_n_s8(int8x16_t __a, int8x16_t __b, const int __c) {
+__funline int8x16_t vrsraq_n_s8(int8x16_t __a, int8x16_t __b, const int __c) {
   return (int8x16_t)__builtin_aarch64_srsra_nv16qi(__a, __b, __c);
 }
-FUNK int16x8_t vrsraq_n_s16(int16x8_t __a, int16x8_t __b, const int __c) {
+__funline int16x8_t vrsraq_n_s16(int16x8_t __a, int16x8_t __b, const int __c) {
   return (int16x8_t)__builtin_aarch64_srsra_nv8hi(__a, __b, __c);
 }
-FUNK int32x4_t vrsraq_n_s32(int32x4_t __a, int32x4_t __b, const int __c) {
+__funline int32x4_t vrsraq_n_s32(int32x4_t __a, int32x4_t __b, const int __c) {
   return (int32x4_t)__builtin_aarch64_srsra_nv4si(__a, __b, __c);
 }
-FUNK int64x2_t vrsraq_n_s64(int64x2_t __a, int64x2_t __b, const int __c) {
+__funline int64x2_t vrsraq_n_s64(int64x2_t __a, int64x2_t __b, const int __c) {
   return (int64x2_t)__builtin_aarch64_srsra_nv2di(__a, __b, __c);
 }
-FUNK uint8x16_t vrsraq_n_u8(uint8x16_t __a, uint8x16_t __b, const int __c) {
+__funline uint8x16_t vrsraq_n_u8(uint8x16_t __a, uint8x16_t __b,
+                                const int __c) {
   return __builtin_aarch64_ursra_nv16qi_uuus(__a, __b, __c);
 }
-FUNK uint16x8_t vrsraq_n_u16(uint16x8_t __a, uint16x8_t __b, const int __c) {
+__funline uint16x8_t vrsraq_n_u16(uint16x8_t __a, uint16x8_t __b,
+                                 const int __c) {
   return __builtin_aarch64_ursra_nv8hi_uuus(__a, __b, __c);
 }
-FUNK uint32x4_t vrsraq_n_u32(uint32x4_t __a, uint32x4_t __b, const int __c) {
+__funline uint32x4_t vrsraq_n_u32(uint32x4_t __a, uint32x4_t __b,
+                                 const int __c) {
   return __builtin_aarch64_ursra_nv4si_uuus(__a, __b, __c);
 }
-FUNK uint64x2_t vrsraq_n_u64(uint64x2_t __a, uint64x2_t __b, const int __c) {
+__funline uint64x2_t vrsraq_n_u64(uint64x2_t __a, uint64x2_t __b,
+                                 const int __c) {
   return __builtin_aarch64_ursra_nv2di_uuus(__a, __b, __c);
 }
-FUNK int64_t vrsrad_n_s64(int64_t __a, int64_t __b, const int __c) {
+__funline int64_t vrsrad_n_s64(int64_t __a, int64_t __b, const int __c) {
   return __builtin_aarch64_srsra_ndi(__a, __b, __c);
 }
-FUNK uint64_t vrsrad_n_u64(uint64_t __a, uint64_t __b, const int __c) {
+__funline uint64_t vrsrad_n_u64(uint64_t __a, uint64_t __b, const int __c) {
   return __builtin_aarch64_ursra_ndi_uuus(__a, __b, __c);
 }
 #pragma GCC push_options
 #pragma GCC target("+nothing+crypto")
-FUNK uint32x4_t vsha1cq_u32(uint32x4_t hash_abcd, uint32_t hash_e,
-                            uint32x4_t wk) {
+__funline uint32x4_t vsha1cq_u32(uint32x4_t hash_abcd, uint32_t hash_e,
+                                uint32x4_t wk) {
   return __builtin_aarch64_crypto_sha1cv4si_uuuu(hash_abcd, hash_e, wk);
 }
-FUNK uint32x4_t vsha1mq_u32(uint32x4_t hash_abcd, uint32_t hash_e,
-                            uint32x4_t wk) {
+__funline uint32x4_t vsha1mq_u32(uint32x4_t hash_abcd, uint32_t hash_e,
+                                uint32x4_t wk) {
   return __builtin_aarch64_crypto_sha1mv4si_uuuu(hash_abcd, hash_e, wk);
 }
-FUNK uint32x4_t vsha1pq_u32(uint32x4_t hash_abcd, uint32_t hash_e,
-                            uint32x4_t wk) {
+__funline uint32x4_t vsha1pq_u32(uint32x4_t hash_abcd, uint32_t hash_e,
+                                uint32x4_t wk) {
   return __builtin_aarch64_crypto_sha1pv4si_uuuu(hash_abcd, hash_e, wk);
 }
-FUNK uint32_t vsha1h_u32(uint32_t hash_e) {
+__funline uint32_t vsha1h_u32(uint32_t hash_e) {
   return __builtin_aarch64_crypto_sha1hsi_uu(hash_e);
 }
-FUNK uint32x4_t vsha1su0q_u32(uint32x4_t w0_3, uint32x4_t w4_7,
-                              uint32x4_t w8_11) {
+__funline uint32x4_t vsha1su0q_u32(uint32x4_t w0_3, uint32x4_t w4_7,
+                                  uint32x4_t w8_11) {
   return __builtin_aarch64_crypto_sha1su0v4si_uuuu(w0_3, w4_7, w8_11);
 }
-FUNK uint32x4_t vsha1su1q_u32(uint32x4_t tw0_3, uint32x4_t w12_15) {
+__funline uint32x4_t vsha1su1q_u32(uint32x4_t tw0_3, uint32x4_t w12_15) {
   return __builtin_aarch64_crypto_sha1su1v4si_uuu(tw0_3, w12_15);
 }
-FUNK uint32x4_t vsha256hq_u32(uint32x4_t hash_abcd, uint32x4_t hash_efgh,
-                              uint32x4_t wk) {
+__funline uint32x4_t vsha256hq_u32(uint32x4_t hash_abcd, uint32x4_t hash_efgh,
+                                  uint32x4_t wk) {
   return __builtin_aarch64_crypto_sha256hv4si_uuuu(hash_abcd, hash_efgh, wk);
 }
-FUNK uint32x4_t vsha256h2q_u32(uint32x4_t hash_efgh, uint32x4_t hash_abcd,
-                               uint32x4_t wk) {
+__funline uint32x4_t vsha256h2q_u32(uint32x4_t hash_efgh, uint32x4_t hash_abcd,
+                                   uint32x4_t wk) {
   return __builtin_aarch64_crypto_sha256h2v4si_uuuu(hash_efgh, hash_abcd, wk);
 }
-FUNK uint32x4_t vsha256su0q_u32(uint32x4_t w0_3, uint32x4_t w4_7) {
+__funline uint32x4_t vsha256su0q_u32(uint32x4_t w0_3, uint32x4_t w4_7) {
   return __builtin_aarch64_crypto_sha256su0v4si_uuu(w0_3, w4_7);
 }
-FUNK uint32x4_t vsha256su1q_u32(uint32x4_t tw0_3, uint32x4_t w8_11,
-                                uint32x4_t w12_15) {
+__funline uint32x4_t vsha256su1q_u32(uint32x4_t tw0_3, uint32x4_t w8_11,
+                                    uint32x4_t w12_15) {
   return __builtin_aarch64_crypto_sha256su1v4si_uuuu(tw0_3, w8_11, w12_15);
 }
-FUNK poly128_t vmull_p64(poly64_t a, poly64_t b) {
+__funline poly128_t vmull_p64(poly64_t a, poly64_t b) {
   return __builtin_aarch64_crypto_pmulldi_ppp(a, b);
 }
-FUNK poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b) {
+__funline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b) {
   return __builtin_aarch64_crypto_pmullv2di_ppp(a, b);
 }
 #pragma GCC pop_options
-FUNK int8x8_t vshl_n_s8(int8x8_t __a, const int __b) {
+__funline int8x8_t vshl_n_s8(int8x8_t __a, const int __b) {
   return (int8x8_t)__builtin_aarch64_ashlv8qi(__a, __b);
 }
-FUNK int16x4_t vshl_n_s16(int16x4_t __a, const int __b) {
+__funline int16x4_t vshl_n_s16(int16x4_t __a, const int __b) {
   return (int16x4_t)__builtin_aarch64_ashlv4hi(__a, __b);
 }
-FUNK int32x2_t vshl_n_s32(int32x2_t __a, const int __b) {
+__funline int32x2_t vshl_n_s32(int32x2_t __a, const int __b) {
   return (int32x2_t)__builtin_aarch64_ashlv2si(__a, __b);
 }
-FUNK int64x1_t vshl_n_s64(int64x1_t __a, const int __b) {
int __b) { +__funline int64x1_t vshl_n_s64(int64x1_t __a, const int __b) { return (int64x1_t){__builtin_aarch64_ashldi(__a[0], __b)}; } -FUNK uint8x8_t vshl_n_u8(uint8x8_t __a, const int __b) { +__funline uint8x8_t vshl_n_u8(uint8x8_t __a, const int __b) { return (uint8x8_t)__builtin_aarch64_ashlv8qi((int8x8_t)__a, __b); } -FUNK uint16x4_t vshl_n_u16(uint16x4_t __a, const int __b) { +__funline uint16x4_t vshl_n_u16(uint16x4_t __a, const int __b) { return (uint16x4_t)__builtin_aarch64_ashlv4hi((int16x4_t)__a, __b); } -FUNK uint32x2_t vshl_n_u32(uint32x2_t __a, const int __b) { +__funline uint32x2_t vshl_n_u32(uint32x2_t __a, const int __b) { return (uint32x2_t)__builtin_aarch64_ashlv2si((int32x2_t)__a, __b); } -FUNK uint64x1_t vshl_n_u64(uint64x1_t __a, const int __b) { +__funline uint64x1_t vshl_n_u64(uint64x1_t __a, const int __b) { return (uint64x1_t){__builtin_aarch64_ashldi((int64_t)__a[0], __b)}; } -FUNK int8x16_t vshlq_n_s8(int8x16_t __a, const int __b) { +__funline int8x16_t vshlq_n_s8(int8x16_t __a, const int __b) { return (int8x16_t)__builtin_aarch64_ashlv16qi(__a, __b); } -FUNK int16x8_t vshlq_n_s16(int16x8_t __a, const int __b) { +__funline int16x8_t vshlq_n_s16(int16x8_t __a, const int __b) { return (int16x8_t)__builtin_aarch64_ashlv8hi(__a, __b); } -FUNK int32x4_t vshlq_n_s32(int32x4_t __a, const int __b) { +__funline int32x4_t vshlq_n_s32(int32x4_t __a, const int __b) { return (int32x4_t)__builtin_aarch64_ashlv4si(__a, __b); } -FUNK int64x2_t vshlq_n_s64(int64x2_t __a, const int __b) { +__funline int64x2_t vshlq_n_s64(int64x2_t __a, const int __b) { return (int64x2_t)__builtin_aarch64_ashlv2di(__a, __b); } -FUNK uint8x16_t vshlq_n_u8(uint8x16_t __a, const int __b) { +__funline uint8x16_t vshlq_n_u8(uint8x16_t __a, const int __b) { return (uint8x16_t)__builtin_aarch64_ashlv16qi((int8x16_t)__a, __b); } -FUNK uint16x8_t vshlq_n_u16(uint16x8_t __a, const int __b) { +__funline uint16x8_t vshlq_n_u16(uint16x8_t __a, const int __b) { return (uint16x8_t)__builtin_aarch64_ashlv8hi((int16x8_t)__a, __b); } -FUNK uint32x4_t vshlq_n_u32(uint32x4_t __a, const int __b) { +__funline uint32x4_t vshlq_n_u32(uint32x4_t __a, const int __b) { return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__a, __b); } -FUNK uint64x2_t vshlq_n_u64(uint64x2_t __a, const int __b) { +__funline uint64x2_t vshlq_n_u64(uint64x2_t __a, const int __b) { return (uint64x2_t)__builtin_aarch64_ashlv2di((int64x2_t)__a, __b); } -FUNK int64_t vshld_n_s64(int64_t __a, const int __b) { +__funline int64_t vshld_n_s64(int64_t __a, const int __b) { return __builtin_aarch64_ashldi(__a, __b); } -FUNK uint64_t vshld_n_u64(uint64_t __a, const int __b) { +__funline uint64_t vshld_n_u64(uint64_t __a, const int __b) { return (uint64_t)__builtin_aarch64_ashldi(__a, __b); } -FUNK int8x8_t vshl_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vshl_s8(int8x8_t __a, int8x8_t __b) { return __builtin_aarch64_sshlv8qi(__a, __b); } -FUNK int16x4_t vshl_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vshl_s16(int16x4_t __a, int16x4_t __b) { return __builtin_aarch64_sshlv4hi(__a, __b); } -FUNK int32x2_t vshl_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vshl_s32(int32x2_t __a, int32x2_t __b) { return __builtin_aarch64_sshlv2si(__a, __b); } -FUNK int64x1_t vshl_s64(int64x1_t __a, int64x1_t __b) { +__funline int64x1_t vshl_s64(int64x1_t __a, int64x1_t __b) { return (int64x1_t){__builtin_aarch64_sshldi(__a[0], __b[0])}; } -FUNK uint8x8_t vshl_u8(uint8x8_t __a, int8x8_t __b) { +__funline uint8x8_t vshl_u8(uint8x8_t __a, 
int8x8_t __b) { return __builtin_aarch64_ushlv8qi_uus(__a, __b); } -FUNK uint16x4_t vshl_u16(uint16x4_t __a, int16x4_t __b) { +__funline uint16x4_t vshl_u16(uint16x4_t __a, int16x4_t __b) { return __builtin_aarch64_ushlv4hi_uus(__a, __b); } -FUNK uint32x2_t vshl_u32(uint32x2_t __a, int32x2_t __b) { +__funline uint32x2_t vshl_u32(uint32x2_t __a, int32x2_t __b) { return __builtin_aarch64_ushlv2si_uus(__a, __b); } -FUNK uint64x1_t vshl_u64(uint64x1_t __a, int64x1_t __b) { +__funline uint64x1_t vshl_u64(uint64x1_t __a, int64x1_t __b) { return (uint64x1_t){__builtin_aarch64_ushldi_uus(__a[0], __b[0])}; } -FUNK int8x16_t vshlq_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vshlq_s8(int8x16_t __a, int8x16_t __b) { return __builtin_aarch64_sshlv16qi(__a, __b); } -FUNK int16x8_t vshlq_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vshlq_s16(int16x8_t __a, int16x8_t __b) { return __builtin_aarch64_sshlv8hi(__a, __b); } -FUNK int32x4_t vshlq_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vshlq_s32(int32x4_t __a, int32x4_t __b) { return __builtin_aarch64_sshlv4si(__a, __b); } -FUNK int64x2_t vshlq_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t vshlq_s64(int64x2_t __a, int64x2_t __b) { return __builtin_aarch64_sshlv2di(__a, __b); } -FUNK uint8x16_t vshlq_u8(uint8x16_t __a, int8x16_t __b) { +__funline uint8x16_t vshlq_u8(uint8x16_t __a, int8x16_t __b) { return __builtin_aarch64_ushlv16qi_uus(__a, __b); } -FUNK uint16x8_t vshlq_u16(uint16x8_t __a, int16x8_t __b) { +__funline uint16x8_t vshlq_u16(uint16x8_t __a, int16x8_t __b) { return __builtin_aarch64_ushlv8hi_uus(__a, __b); } -FUNK uint32x4_t vshlq_u32(uint32x4_t __a, int32x4_t __b) { +__funline uint32x4_t vshlq_u32(uint32x4_t __a, int32x4_t __b) { return __builtin_aarch64_ushlv4si_uus(__a, __b); } -FUNK uint64x2_t vshlq_u64(uint64x2_t __a, int64x2_t __b) { +__funline uint64x2_t vshlq_u64(uint64x2_t __a, int64x2_t __b) { return __builtin_aarch64_ushlv2di_uus(__a, __b); } -FUNK int64_t vshld_s64(int64_t __a, int64_t __b) { +__funline int64_t vshld_s64(int64_t __a, int64_t __b) { return __builtin_aarch64_sshldi(__a, __b); } -FUNK uint64_t vshld_u64(uint64_t __a, uint64_t __b) { +__funline uint64_t vshld_u64(uint64_t __a, uint64_t __b) { return __builtin_aarch64_ushldi_uus(__a, __b); } -FUNK int16x8_t vshll_high_n_s8(int8x16_t __a, const int __b) { +__funline int16x8_t vshll_high_n_s8(int8x16_t __a, const int __b) { return __builtin_aarch64_sshll2_nv16qi(__a, __b); } -FUNK int32x4_t vshll_high_n_s16(int16x8_t __a, const int __b) { +__funline int32x4_t vshll_high_n_s16(int16x8_t __a, const int __b) { return __builtin_aarch64_sshll2_nv8hi(__a, __b); } -FUNK int64x2_t vshll_high_n_s32(int32x4_t __a, const int __b) { +__funline int64x2_t vshll_high_n_s32(int32x4_t __a, const int __b) { return __builtin_aarch64_sshll2_nv4si(__a, __b); } -FUNK uint16x8_t vshll_high_n_u8(uint8x16_t __a, const int __b) { +__funline uint16x8_t vshll_high_n_u8(uint8x16_t __a, const int __b) { return (uint16x8_t)__builtin_aarch64_ushll2_nv16qi((int8x16_t)__a, __b); } -FUNK uint32x4_t vshll_high_n_u16(uint16x8_t __a, const int __b) { +__funline uint32x4_t vshll_high_n_u16(uint16x8_t __a, const int __b) { return (uint32x4_t)__builtin_aarch64_ushll2_nv8hi((int16x8_t)__a, __b); } -FUNK uint64x2_t vshll_high_n_u32(uint32x4_t __a, const int __b) { +__funline uint64x2_t vshll_high_n_u32(uint32x4_t __a, const int __b) { return (uint64x2_t)__builtin_aarch64_ushll2_nv4si((int32x4_t)__a, __b); } -FUNK int16x8_t vshll_n_s8(int8x8_t __a, const int __b) { 
+__funline int16x8_t vshll_n_s8(int8x8_t __a, const int __b) { return __builtin_aarch64_sshll_nv8qi(__a, __b); } -FUNK int32x4_t vshll_n_s16(int16x4_t __a, const int __b) { +__funline int32x4_t vshll_n_s16(int16x4_t __a, const int __b) { return __builtin_aarch64_sshll_nv4hi(__a, __b); } -FUNK int64x2_t vshll_n_s32(int32x2_t __a, const int __b) { +__funline int64x2_t vshll_n_s32(int32x2_t __a, const int __b) { return __builtin_aarch64_sshll_nv2si(__a, __b); } -FUNK uint16x8_t vshll_n_u8(uint8x8_t __a, const int __b) { +__funline uint16x8_t vshll_n_u8(uint8x8_t __a, const int __b) { return __builtin_aarch64_ushll_nv8qi_uus(__a, __b); } -FUNK uint32x4_t vshll_n_u16(uint16x4_t __a, const int __b) { +__funline uint32x4_t vshll_n_u16(uint16x4_t __a, const int __b) { return __builtin_aarch64_ushll_nv4hi_uus(__a, __b); } -FUNK uint64x2_t vshll_n_u32(uint32x2_t __a, const int __b) { +__funline uint64x2_t vshll_n_u32(uint32x2_t __a, const int __b) { return __builtin_aarch64_ushll_nv2si_uus(__a, __b); } -FUNK int8x8_t vshr_n_s8(int8x8_t __a, const int __b) { +__funline int8x8_t vshr_n_s8(int8x8_t __a, const int __b) { return (int8x8_t)__builtin_aarch64_ashrv8qi(__a, __b); } -FUNK int16x4_t vshr_n_s16(int16x4_t __a, const int __b) { +__funline int16x4_t vshr_n_s16(int16x4_t __a, const int __b) { return (int16x4_t)__builtin_aarch64_ashrv4hi(__a, __b); } -FUNK int32x2_t vshr_n_s32(int32x2_t __a, const int __b) { +__funline int32x2_t vshr_n_s32(int32x2_t __a, const int __b) { return (int32x2_t)__builtin_aarch64_ashrv2si(__a, __b); } -FUNK int64x1_t vshr_n_s64(int64x1_t __a, const int __b) { +__funline int64x1_t vshr_n_s64(int64x1_t __a, const int __b) { return (int64x1_t){__builtin_aarch64_ashr_simddi(__a[0], __b)}; } -FUNK uint8x8_t vshr_n_u8(uint8x8_t __a, const int __b) { +__funline uint8x8_t vshr_n_u8(uint8x8_t __a, const int __b) { return (uint8x8_t)__builtin_aarch64_lshrv8qi((int8x8_t)__a, __b); } -FUNK uint16x4_t vshr_n_u16(uint16x4_t __a, const int __b) { +__funline uint16x4_t vshr_n_u16(uint16x4_t __a, const int __b) { return (uint16x4_t)__builtin_aarch64_lshrv4hi((int16x4_t)__a, __b); } -FUNK uint32x2_t vshr_n_u32(uint32x2_t __a, const int __b) { +__funline uint32x2_t vshr_n_u32(uint32x2_t __a, const int __b) { return (uint32x2_t)__builtin_aarch64_lshrv2si((int32x2_t)__a, __b); } -FUNK uint64x1_t vshr_n_u64(uint64x1_t __a, const int __b) { +__funline uint64x1_t vshr_n_u64(uint64x1_t __a, const int __b) { return (uint64x1_t){__builtin_aarch64_lshr_simddi_uus(__a[0], __b)}; } -FUNK int8x16_t vshrq_n_s8(int8x16_t __a, const int __b) { +__funline int8x16_t vshrq_n_s8(int8x16_t __a, const int __b) { return (int8x16_t)__builtin_aarch64_ashrv16qi(__a, __b); } -FUNK int16x8_t vshrq_n_s16(int16x8_t __a, const int __b) { +__funline int16x8_t vshrq_n_s16(int16x8_t __a, const int __b) { return (int16x8_t)__builtin_aarch64_ashrv8hi(__a, __b); } -FUNK int32x4_t vshrq_n_s32(int32x4_t __a, const int __b) { +__funline int32x4_t vshrq_n_s32(int32x4_t __a, const int __b) { return (int32x4_t)__builtin_aarch64_ashrv4si(__a, __b); } -FUNK int64x2_t vshrq_n_s64(int64x2_t __a, const int __b) { +__funline int64x2_t vshrq_n_s64(int64x2_t __a, const int __b) { return (int64x2_t)__builtin_aarch64_ashrv2di(__a, __b); } -FUNK uint8x16_t vshrq_n_u8(uint8x16_t __a, const int __b) { +__funline uint8x16_t vshrq_n_u8(uint8x16_t __a, const int __b) { return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__a, __b); } -FUNK uint16x8_t vshrq_n_u16(uint16x8_t __a, const int __b) { +__funline uint16x8_t vshrq_n_u16(uint16x8_t __a, 
const int __b) { return (uint16x8_t)__builtin_aarch64_lshrv8hi((int16x8_t)__a, __b); } -FUNK uint32x4_t vshrq_n_u32(uint32x4_t __a, const int __b) { +__funline uint32x4_t vshrq_n_u32(uint32x4_t __a, const int __b) { return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__a, __b); } -FUNK uint64x2_t vshrq_n_u64(uint64x2_t __a, const int __b) { +__funline uint64x2_t vshrq_n_u64(uint64x2_t __a, const int __b) { return (uint64x2_t)__builtin_aarch64_lshrv2di((int64x2_t)__a, __b); } -FUNK int64_t vshrd_n_s64(int64_t __a, const int __b) { +__funline int64_t vshrd_n_s64(int64_t __a, const int __b) { return __builtin_aarch64_ashr_simddi(__a, __b); } -FUNK uint64_t vshrd_n_u64(uint64_t __a, const int __b) { +__funline uint64_t vshrd_n_u64(uint64_t __a, const int __b) { return __builtin_aarch64_lshr_simddi_uus(__a, __b); } -FUNK int8x8_t vsli_n_s8(int8x8_t __a, int8x8_t __b, const int __c) { +__funline int8x8_t vsli_n_s8(int8x8_t __a, int8x8_t __b, const int __c) { return (int8x8_t)__builtin_aarch64_ssli_nv8qi(__a, __b, __c); } -FUNK int16x4_t vsli_n_s16(int16x4_t __a, int16x4_t __b, const int __c) { +__funline int16x4_t vsli_n_s16(int16x4_t __a, int16x4_t __b, const int __c) { return (int16x4_t)__builtin_aarch64_ssli_nv4hi(__a, __b, __c); } -FUNK int32x2_t vsli_n_s32(int32x2_t __a, int32x2_t __b, const int __c) { +__funline int32x2_t vsli_n_s32(int32x2_t __a, int32x2_t __b, const int __c) { return (int32x2_t)__builtin_aarch64_ssli_nv2si(__a, __b, __c); } -FUNK int64x1_t vsli_n_s64(int64x1_t __a, int64x1_t __b, const int __c) { +__funline int64x1_t vsli_n_s64(int64x1_t __a, int64x1_t __b, const int __c) { return (int64x1_t){__builtin_aarch64_ssli_ndi(__a[0], __b[0], __c)}; } -FUNK uint8x8_t vsli_n_u8(uint8x8_t __a, uint8x8_t __b, const int __c) { +__funline uint8x8_t vsli_n_u8(uint8x8_t __a, uint8x8_t __b, const int __c) { return __builtin_aarch64_usli_nv8qi_uuus(__a, __b, __c); } -FUNK uint16x4_t vsli_n_u16(uint16x4_t __a, uint16x4_t __b, const int __c) { +__funline uint16x4_t vsli_n_u16(uint16x4_t __a, uint16x4_t __b, const int __c) { return __builtin_aarch64_usli_nv4hi_uuus(__a, __b, __c); } -FUNK uint32x2_t vsli_n_u32(uint32x2_t __a, uint32x2_t __b, const int __c) { +__funline uint32x2_t vsli_n_u32(uint32x2_t __a, uint32x2_t __b, const int __c) { return __builtin_aarch64_usli_nv2si_uuus(__a, __b, __c); } -FUNK uint64x1_t vsli_n_u64(uint64x1_t __a, uint64x1_t __b, const int __c) { +__funline uint64x1_t vsli_n_u64(uint64x1_t __a, uint64x1_t __b, const int __c) { return (uint64x1_t){__builtin_aarch64_usli_ndi_uuus(__a[0], __b[0], __c)}; } -FUNK poly64x1_t vsli_n_p64(poly64x1_t __a, poly64x1_t __b, const int __c) { +__funline poly64x1_t vsli_n_p64(poly64x1_t __a, poly64x1_t __b, const int __c) { return (poly64x1_t){__builtin_aarch64_ssli_ndi_ppps(__a[0], __b[0], __c)}; } -FUNK int8x16_t vsliq_n_s8(int8x16_t __a, int8x16_t __b, const int __c) { +__funline int8x16_t vsliq_n_s8(int8x16_t __a, int8x16_t __b, const int __c) { return (int8x16_t)__builtin_aarch64_ssli_nv16qi(__a, __b, __c); } -FUNK int16x8_t vsliq_n_s16(int16x8_t __a, int16x8_t __b, const int __c) { +__funline int16x8_t vsliq_n_s16(int16x8_t __a, int16x8_t __b, const int __c) { return (int16x8_t)__builtin_aarch64_ssli_nv8hi(__a, __b, __c); } -FUNK int32x4_t vsliq_n_s32(int32x4_t __a, int32x4_t __b, const int __c) { +__funline int32x4_t vsliq_n_s32(int32x4_t __a, int32x4_t __b, const int __c) { return (int32x4_t)__builtin_aarch64_ssli_nv4si(__a, __b, __c); } -FUNK int64x2_t vsliq_n_s64(int64x2_t __a, int64x2_t __b, const int __c) { 
+__funline int64x2_t vsliq_n_s64(int64x2_t __a, int64x2_t __b, const int __c) { return (int64x2_t)__builtin_aarch64_ssli_nv2di(__a, __b, __c); } -FUNK uint8x16_t vsliq_n_u8(uint8x16_t __a, uint8x16_t __b, const int __c) { +__funline uint8x16_t vsliq_n_u8(uint8x16_t __a, uint8x16_t __b, const int __c) { return __builtin_aarch64_usli_nv16qi_uuus(__a, __b, __c); } -FUNK uint16x8_t vsliq_n_u16(uint16x8_t __a, uint16x8_t __b, const int __c) { +__funline uint16x8_t vsliq_n_u16(uint16x8_t __a, uint16x8_t __b, + const int __c) { return __builtin_aarch64_usli_nv8hi_uuus(__a, __b, __c); } -FUNK uint32x4_t vsliq_n_u32(uint32x4_t __a, uint32x4_t __b, const int __c) { +__funline uint32x4_t vsliq_n_u32(uint32x4_t __a, uint32x4_t __b, + const int __c) { return __builtin_aarch64_usli_nv4si_uuus(__a, __b, __c); } -FUNK uint64x2_t vsliq_n_u64(uint64x2_t __a, uint64x2_t __b, const int __c) { +__funline uint64x2_t vsliq_n_u64(uint64x2_t __a, uint64x2_t __b, + const int __c) { return __builtin_aarch64_usli_nv2di_uuus(__a, __b, __c); } -FUNK poly64x2_t vsliq_n_p64(poly64x2_t __a, poly64x2_t __b, const int __c) { +__funline poly64x2_t vsliq_n_p64(poly64x2_t __a, poly64x2_t __b, + const int __c) { return __builtin_aarch64_ssli_nv2di_ppps(__a, __b, __c); } -FUNK int64_t vslid_n_s64(int64_t __a, int64_t __b, const int __c) { +__funline int64_t vslid_n_s64(int64_t __a, int64_t __b, const int __c) { return __builtin_aarch64_ssli_ndi(__a, __b, __c); } -FUNK uint64_t vslid_n_u64(uint64_t __a, uint64_t __b, const int __c) { +__funline uint64_t vslid_n_u64(uint64_t __a, uint64_t __b, const int __c) { return __builtin_aarch64_usli_ndi_uuus(__a, __b, __c); } -FUNK uint8x8_t vsqadd_u8(uint8x8_t __a, int8x8_t __b) { +__funline uint8x8_t vsqadd_u8(uint8x8_t __a, int8x8_t __b) { return __builtin_aarch64_usqaddv8qi_uus(__a, __b); } -FUNK uint16x4_t vsqadd_u16(uint16x4_t __a, int16x4_t __b) { +__funline uint16x4_t vsqadd_u16(uint16x4_t __a, int16x4_t __b) { return __builtin_aarch64_usqaddv4hi_uus(__a, __b); } -FUNK uint32x2_t vsqadd_u32(uint32x2_t __a, int32x2_t __b) { +__funline uint32x2_t vsqadd_u32(uint32x2_t __a, int32x2_t __b) { return __builtin_aarch64_usqaddv2si_uus(__a, __b); } -FUNK uint64x1_t vsqadd_u64(uint64x1_t __a, int64x1_t __b) { +__funline uint64x1_t vsqadd_u64(uint64x1_t __a, int64x1_t __b) { return (uint64x1_t){__builtin_aarch64_usqadddi_uus(__a[0], __b[0])}; } -FUNK uint8x16_t vsqaddq_u8(uint8x16_t __a, int8x16_t __b) { +__funline uint8x16_t vsqaddq_u8(uint8x16_t __a, int8x16_t __b) { return __builtin_aarch64_usqaddv16qi_uus(__a, __b); } -FUNK uint16x8_t vsqaddq_u16(uint16x8_t __a, int16x8_t __b) { +__funline uint16x8_t vsqaddq_u16(uint16x8_t __a, int16x8_t __b) { return __builtin_aarch64_usqaddv8hi_uus(__a, __b); } -FUNK uint32x4_t vsqaddq_u32(uint32x4_t __a, int32x4_t __b) { +__funline uint32x4_t vsqaddq_u32(uint32x4_t __a, int32x4_t __b) { return __builtin_aarch64_usqaddv4si_uus(__a, __b); } -FUNK uint64x2_t vsqaddq_u64(uint64x2_t __a, int64x2_t __b) { +__funline uint64x2_t vsqaddq_u64(uint64x2_t __a, int64x2_t __b) { return __builtin_aarch64_usqaddv2di_uus(__a, __b); } -FUNK uint8_t vsqaddb_u8(uint8_t __a, int8_t __b) { +__funline uint8_t vsqaddb_u8(uint8_t __a, int8_t __b) { return __builtin_aarch64_usqaddqi_uus(__a, __b); } -FUNK uint16_t vsqaddh_u16(uint16_t __a, int16_t __b) { +__funline uint16_t vsqaddh_u16(uint16_t __a, int16_t __b) { return __builtin_aarch64_usqaddhi_uus(__a, __b); } -FUNK uint32_t vsqadds_u32(uint32_t __a, int32_t __b) { +__funline uint32_t vsqadds_u32(uint32_t __a, int32_t __b) { 
return __builtin_aarch64_usqaddsi_uus(__a, __b); } -FUNK uint64_t vsqaddd_u64(uint64_t __a, int64_t __b) { +__funline uint64_t vsqaddd_u64(uint64_t __a, int64_t __b) { return __builtin_aarch64_usqadddi_uus(__a, __b); } -FUNK float32x2_t vsqrt_f32(float32x2_t a) { +__funline float32x2_t vsqrt_f32(float32x2_t a) { return __builtin_aarch64_sqrtv2sf(a); } -FUNK float32x4_t vsqrtq_f32(float32x4_t a) { +__funline float32x4_t vsqrtq_f32(float32x4_t a) { return __builtin_aarch64_sqrtv4sf(a); } -FUNK float64x1_t vsqrt_f64(float64x1_t a) { +__funline float64x1_t vsqrt_f64(float64x1_t a) { return (float64x1_t){__builtin_aarch64_sqrtdf(a[0])}; } -FUNK float64x2_t vsqrtq_f64(float64x2_t a) { +__funline float64x2_t vsqrtq_f64(float64x2_t a) { return __builtin_aarch64_sqrtv2df(a); } -FUNK int8x8_t vsra_n_s8(int8x8_t __a, int8x8_t __b, const int __c) { +__funline int8x8_t vsra_n_s8(int8x8_t __a, int8x8_t __b, const int __c) { return (int8x8_t)__builtin_aarch64_ssra_nv8qi(__a, __b, __c); } -FUNK int16x4_t vsra_n_s16(int16x4_t __a, int16x4_t __b, const int __c) { +__funline int16x4_t vsra_n_s16(int16x4_t __a, int16x4_t __b, const int __c) { return (int16x4_t)__builtin_aarch64_ssra_nv4hi(__a, __b, __c); } -FUNK int32x2_t vsra_n_s32(int32x2_t __a, int32x2_t __b, const int __c) { +__funline int32x2_t vsra_n_s32(int32x2_t __a, int32x2_t __b, const int __c) { return (int32x2_t)__builtin_aarch64_ssra_nv2si(__a, __b, __c); } -FUNK int64x1_t vsra_n_s64(int64x1_t __a, int64x1_t __b, const int __c) { +__funline int64x1_t vsra_n_s64(int64x1_t __a, int64x1_t __b, const int __c) { return (int64x1_t){__builtin_aarch64_ssra_ndi(__a[0], __b[0], __c)}; } -FUNK uint8x8_t vsra_n_u8(uint8x8_t __a, uint8x8_t __b, const int __c) { +__funline uint8x8_t vsra_n_u8(uint8x8_t __a, uint8x8_t __b, const int __c) { return __builtin_aarch64_usra_nv8qi_uuus(__a, __b, __c); } -FUNK uint16x4_t vsra_n_u16(uint16x4_t __a, uint16x4_t __b, const int __c) { +__funline uint16x4_t vsra_n_u16(uint16x4_t __a, uint16x4_t __b, const int __c) { return __builtin_aarch64_usra_nv4hi_uuus(__a, __b, __c); } -FUNK uint32x2_t vsra_n_u32(uint32x2_t __a, uint32x2_t __b, const int __c) { +__funline uint32x2_t vsra_n_u32(uint32x2_t __a, uint32x2_t __b, const int __c) { return __builtin_aarch64_usra_nv2si_uuus(__a, __b, __c); } -FUNK uint64x1_t vsra_n_u64(uint64x1_t __a, uint64x1_t __b, const int __c) { +__funline uint64x1_t vsra_n_u64(uint64x1_t __a, uint64x1_t __b, const int __c) { return (uint64x1_t){__builtin_aarch64_usra_ndi_uuus(__a[0], __b[0], __c)}; } -FUNK int8x16_t vsraq_n_s8(int8x16_t __a, int8x16_t __b, const int __c) { +__funline int8x16_t vsraq_n_s8(int8x16_t __a, int8x16_t __b, const int __c) { return (int8x16_t)__builtin_aarch64_ssra_nv16qi(__a, __b, __c); } -FUNK int16x8_t vsraq_n_s16(int16x8_t __a, int16x8_t __b, const int __c) { +__funline int16x8_t vsraq_n_s16(int16x8_t __a, int16x8_t __b, const int __c) { return (int16x8_t)__builtin_aarch64_ssra_nv8hi(__a, __b, __c); } -FUNK int32x4_t vsraq_n_s32(int32x4_t __a, int32x4_t __b, const int __c) { +__funline int32x4_t vsraq_n_s32(int32x4_t __a, int32x4_t __b, const int __c) { return (int32x4_t)__builtin_aarch64_ssra_nv4si(__a, __b, __c); } -FUNK int64x2_t vsraq_n_s64(int64x2_t __a, int64x2_t __b, const int __c) { +__funline int64x2_t vsraq_n_s64(int64x2_t __a, int64x2_t __b, const int __c) { return (int64x2_t)__builtin_aarch64_ssra_nv2di(__a, __b, __c); } -FUNK uint8x16_t vsraq_n_u8(uint8x16_t __a, uint8x16_t __b, const int __c) { +__funline uint8x16_t vsraq_n_u8(uint8x16_t __a, uint8x16_t __b, 
const int __c) { return __builtin_aarch64_usra_nv16qi_uuus(__a, __b, __c); } -FUNK uint16x8_t vsraq_n_u16(uint16x8_t __a, uint16x8_t __b, const int __c) { +__funline uint16x8_t vsraq_n_u16(uint16x8_t __a, uint16x8_t __b, + const int __c) { return __builtin_aarch64_usra_nv8hi_uuus(__a, __b, __c); } -FUNK uint32x4_t vsraq_n_u32(uint32x4_t __a, uint32x4_t __b, const int __c) { +__funline uint32x4_t vsraq_n_u32(uint32x4_t __a, uint32x4_t __b, + const int __c) { return __builtin_aarch64_usra_nv4si_uuus(__a, __b, __c); } -FUNK uint64x2_t vsraq_n_u64(uint64x2_t __a, uint64x2_t __b, const int __c) { +__funline uint64x2_t vsraq_n_u64(uint64x2_t __a, uint64x2_t __b, + const int __c) { return __builtin_aarch64_usra_nv2di_uuus(__a, __b, __c); } -FUNK int64_t vsrad_n_s64(int64_t __a, int64_t __b, const int __c) { +__funline int64_t vsrad_n_s64(int64_t __a, int64_t __b, const int __c) { return __builtin_aarch64_ssra_ndi(__a, __b, __c); } -FUNK uint64_t vsrad_n_u64(uint64_t __a, uint64_t __b, const int __c) { +__funline uint64_t vsrad_n_u64(uint64_t __a, uint64_t __b, const int __c) { return __builtin_aarch64_usra_ndi_uuus(__a, __b, __c); } -FUNK int8x8_t vsri_n_s8(int8x8_t __a, int8x8_t __b, const int __c) { +__funline int8x8_t vsri_n_s8(int8x8_t __a, int8x8_t __b, const int __c) { return (int8x8_t)__builtin_aarch64_ssri_nv8qi(__a, __b, __c); } -FUNK int16x4_t vsri_n_s16(int16x4_t __a, int16x4_t __b, const int __c) { +__funline int16x4_t vsri_n_s16(int16x4_t __a, int16x4_t __b, const int __c) { return (int16x4_t)__builtin_aarch64_ssri_nv4hi(__a, __b, __c); } -FUNK int32x2_t vsri_n_s32(int32x2_t __a, int32x2_t __b, const int __c) { +__funline int32x2_t vsri_n_s32(int32x2_t __a, int32x2_t __b, const int __c) { return (int32x2_t)__builtin_aarch64_ssri_nv2si(__a, __b, __c); } -FUNK int64x1_t vsri_n_s64(int64x1_t __a, int64x1_t __b, const int __c) { +__funline int64x1_t vsri_n_s64(int64x1_t __a, int64x1_t __b, const int __c) { return (int64x1_t){__builtin_aarch64_ssri_ndi(__a[0], __b[0], __c)}; } -FUNK uint8x8_t vsri_n_u8(uint8x8_t __a, uint8x8_t __b, const int __c) { +__funline uint8x8_t vsri_n_u8(uint8x8_t __a, uint8x8_t __b, const int __c) { return __builtin_aarch64_usri_nv8qi_uuus(__a, __b, __c); } -FUNK uint16x4_t vsri_n_u16(uint16x4_t __a, uint16x4_t __b, const int __c) { +__funline uint16x4_t vsri_n_u16(uint16x4_t __a, uint16x4_t __b, const int __c) { return __builtin_aarch64_usri_nv4hi_uuus(__a, __b, __c); } -FUNK uint32x2_t vsri_n_u32(uint32x2_t __a, uint32x2_t __b, const int __c) { +__funline uint32x2_t vsri_n_u32(uint32x2_t __a, uint32x2_t __b, const int __c) { return __builtin_aarch64_usri_nv2si_uuus(__a, __b, __c); } -FUNK uint64x1_t vsri_n_u64(uint64x1_t __a, uint64x1_t __b, const int __c) { +__funline uint64x1_t vsri_n_u64(uint64x1_t __a, uint64x1_t __b, const int __c) { return (uint64x1_t){__builtin_aarch64_usri_ndi_uuus(__a[0], __b[0], __c)}; } -FUNK int8x16_t vsriq_n_s8(int8x16_t __a, int8x16_t __b, const int __c) { +__funline int8x16_t vsriq_n_s8(int8x16_t __a, int8x16_t __b, const int __c) { return (int8x16_t)__builtin_aarch64_ssri_nv16qi(__a, __b, __c); } -FUNK int16x8_t vsriq_n_s16(int16x8_t __a, int16x8_t __b, const int __c) { +__funline int16x8_t vsriq_n_s16(int16x8_t __a, int16x8_t __b, const int __c) { return (int16x8_t)__builtin_aarch64_ssri_nv8hi(__a, __b, __c); } -FUNK int32x4_t vsriq_n_s32(int32x4_t __a, int32x4_t __b, const int __c) { +__funline int32x4_t vsriq_n_s32(int32x4_t __a, int32x4_t __b, const int __c) { return (int32x4_t)__builtin_aarch64_ssri_nv4si(__a, __b, __c); } 
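The vsra_n_*/vsraq_n_* wrappers above lower to the SSRA/USRA shift-right-accumulate instructions, and the vsri_n_*/vsriq_n_* wrappers to SRI shift-right-insert, which keeps the top __c bits of each destination lane. A minimal per-lane scalar sketch of those semantics, assuming unsigned 16-bit lanes and a shift count n in [1,16]; the helper names are hypothetical, for illustration only:

#include <stdint.h>

/* What vsra_n_u16 computes in each lane: shift right, then accumulate
   (hypothetical scalar reference, not part of the header). */
static uint16_t usra16(uint16_t acc, uint16_t x, int n) {
  return (uint16_t)(acc + (x >> n));
}

/* What vsri_n_u16 computes in each lane: the top n bits of dst are
   preserved and the low 16-n bits are replaced by x >> n. */
static uint16_t sri16(uint16_t dst, uint16_t x, int n) {
  uint16_t mask = (uint16_t)(0xFFFFu >> n);    /* bits the insert writes */
  return (uint16_t)((dst & ~mask) | ((x >> n) & mask));
}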
-FUNK int64x2_t vsriq_n_s64(int64x2_t __a, int64x2_t __b, const int __c) { +__funline int64x2_t vsriq_n_s64(int64x2_t __a, int64x2_t __b, const int __c) { return (int64x2_t)__builtin_aarch64_ssri_nv2di(__a, __b, __c); } -FUNK uint8x16_t vsriq_n_u8(uint8x16_t __a, uint8x16_t __b, const int __c) { +__funline uint8x16_t vsriq_n_u8(uint8x16_t __a, uint8x16_t __b, const int __c) { return __builtin_aarch64_usri_nv16qi_uuus(__a, __b, __c); } -FUNK uint16x8_t vsriq_n_u16(uint16x8_t __a, uint16x8_t __b, const int __c) { +__funline uint16x8_t vsriq_n_u16(uint16x8_t __a, uint16x8_t __b, + const int __c) { return __builtin_aarch64_usri_nv8hi_uuus(__a, __b, __c); } -FUNK uint32x4_t vsriq_n_u32(uint32x4_t __a, uint32x4_t __b, const int __c) { +__funline uint32x4_t vsriq_n_u32(uint32x4_t __a, uint32x4_t __b, + const int __c) { return __builtin_aarch64_usri_nv4si_uuus(__a, __b, __c); } -FUNK uint64x2_t vsriq_n_u64(uint64x2_t __a, uint64x2_t __b, const int __c) { +__funline uint64x2_t vsriq_n_u64(uint64x2_t __a, uint64x2_t __b, + const int __c) { return __builtin_aarch64_usri_nv2di_uuus(__a, __b, __c); } -FUNK int64_t vsrid_n_s64(int64_t __a, int64_t __b, const int __c) { +__funline int64_t vsrid_n_s64(int64_t __a, int64_t __b, const int __c) { return __builtin_aarch64_ssri_ndi(__a, __b, __c); } -FUNK uint64_t vsrid_n_u64(uint64_t __a, uint64_t __b, const int __c) { +__funline uint64_t vsrid_n_u64(uint64_t __a, uint64_t __b, const int __c) { return __builtin_aarch64_usri_ndi_uuus(__a, __b, __c); } -FUNK void vst1_f16(float16_t *__a, float16x4_t __b) { +__funline void vst1_f16(float16_t *__a, float16x4_t __b) { __builtin_aarch64_st1v4hf(__a, __b); } -FUNK void vst1_f32(float32_t *a, float32x2_t b) { +__funline void vst1_f32(float32_t *a, float32x2_t b) { __builtin_aarch64_st1v2sf((__builtin_aarch64_simd_sf *)a, b); } -FUNK void vst1_f64(float64_t *a, float64x1_t b) { +__funline void vst1_f64(float64_t *a, float64x1_t b) { *a = b[0]; } -FUNK void vst1_p8(poly8_t *a, poly8x8_t b) { +__funline void vst1_p8(poly8_t *a, poly8x8_t b) { __builtin_aarch64_st1v8qi((__builtin_aarch64_simd_qi *)a, (int8x8_t)b); } -FUNK void vst1_p16(poly16_t *a, poly16x4_t b) { +__funline void vst1_p16(poly16_t *a, poly16x4_t b) { __builtin_aarch64_st1v4hi((__builtin_aarch64_simd_hi *)a, (int16x4_t)b); } -FUNK void vst1_p64(poly64_t *a, poly64x1_t b) { +__funline void vst1_p64(poly64_t *a, poly64x1_t b) { *a = b[0]; } -FUNK void vst1_s8(int8_t *a, int8x8_t b) { +__funline void vst1_s8(int8_t *a, int8x8_t b) { __builtin_aarch64_st1v8qi((__builtin_aarch64_simd_qi *)a, b); } -FUNK void vst1_s16(int16_t *a, int16x4_t b) { +__funline void vst1_s16(int16_t *a, int16x4_t b) { __builtin_aarch64_st1v4hi((__builtin_aarch64_simd_hi *)a, b); } -FUNK void vst1_s32(int32_t *a, int32x2_t b) { +__funline void vst1_s32(int32_t *a, int32x2_t b) { __builtin_aarch64_st1v2si((__builtin_aarch64_simd_si *)a, b); } -FUNK void vst1_s64(int64_t *a, int64x1_t b) { +__funline void vst1_s64(int64_t *a, int64x1_t b) { *a = b[0]; } -FUNK void vst1_u8(uint8_t *a, uint8x8_t b) { +__funline void vst1_u8(uint8_t *a, uint8x8_t b) { __builtin_aarch64_st1v8qi((__builtin_aarch64_simd_qi *)a, (int8x8_t)b); } -FUNK void vst1_u16(uint16_t *a, uint16x4_t b) { +__funline void vst1_u16(uint16_t *a, uint16x4_t b) { __builtin_aarch64_st1v4hi((__builtin_aarch64_simd_hi *)a, (int16x4_t)b); } -FUNK void vst1_u32(uint32_t *a, uint32x2_t b) { +__funline void vst1_u32(uint32_t *a, uint32x2_t b) { __builtin_aarch64_st1v2si((__builtin_aarch64_simd_si *)a, (int32x2_t)b); } -FUNK void 
vst1_u64(uint64_t *a, uint64x1_t b) { +__funline void vst1_u64(uint64_t *a, uint64x1_t b) { *a = b[0]; } -FUNK void vst1q_f16(float16_t *__a, float16x8_t __b) { +__funline void vst1q_f16(float16_t *__a, float16x8_t __b) { __builtin_aarch64_st1v8hf(__a, __b); } -FUNK void vst1q_f32(float32_t *a, float32x4_t b) { +__funline void vst1q_f32(float32_t *a, float32x4_t b) { __builtin_aarch64_st1v4sf((__builtin_aarch64_simd_sf *)a, b); } -FUNK void vst1q_f64(float64_t *a, float64x2_t b) { +__funline void vst1q_f64(float64_t *a, float64x2_t b) { __builtin_aarch64_st1v2df((__builtin_aarch64_simd_df *)a, b); } -FUNK void vst1q_p8(poly8_t *a, poly8x16_t b) { +__funline void vst1q_p8(poly8_t *a, poly8x16_t b) { __builtin_aarch64_st1v16qi((__builtin_aarch64_simd_qi *)a, (int8x16_t)b); } -FUNK void vst1q_p16(poly16_t *a, poly16x8_t b) { +__funline void vst1q_p16(poly16_t *a, poly16x8_t b) { __builtin_aarch64_st1v8hi((__builtin_aarch64_simd_hi *)a, (int16x8_t)b); } -FUNK void vst1q_p64(poly64_t *a, poly64x2_t b) { +__funline void vst1q_p64(poly64_t *a, poly64x2_t b) { __builtin_aarch64_st1v2di_sp((__builtin_aarch64_simd_di *)a, (poly64x2_t)b); } -FUNK void vst1q_s8(int8_t *a, int8x16_t b) { +__funline void vst1q_s8(int8_t *a, int8x16_t b) { __builtin_aarch64_st1v16qi((__builtin_aarch64_simd_qi *)a, b); } -FUNK void vst1q_s16(int16_t *a, int16x8_t b) { +__funline void vst1q_s16(int16_t *a, int16x8_t b) { __builtin_aarch64_st1v8hi((__builtin_aarch64_simd_hi *)a, b); } -FUNK void vst1q_s32(int32_t *a, int32x4_t b) { +__funline void vst1q_s32(int32_t *a, int32x4_t b) { __builtin_aarch64_st1v4si((__builtin_aarch64_simd_si *)a, b); } -FUNK void vst1q_s64(int64_t *a, int64x2_t b) { +__funline void vst1q_s64(int64_t *a, int64x2_t b) { __builtin_aarch64_st1v2di((__builtin_aarch64_simd_di *)a, b); } -FUNK void vst1q_u8(uint8_t *a, uint8x16_t b) { +__funline void vst1q_u8(uint8_t *a, uint8x16_t b) { __builtin_aarch64_st1v16qi((__builtin_aarch64_simd_qi *)a, (int8x16_t)b); } -FUNK void vst1q_u16(uint16_t *a, uint16x8_t b) { +__funline void vst1q_u16(uint16_t *a, uint16x8_t b) { __builtin_aarch64_st1v8hi((__builtin_aarch64_simd_hi *)a, (int16x8_t)b); } -FUNK void vst1q_u32(uint32_t *a, uint32x4_t b) { +__funline void vst1q_u32(uint32_t *a, uint32x4_t b) { __builtin_aarch64_st1v4si((__builtin_aarch64_simd_si *)a, (int32x4_t)b); } -FUNK void vst1q_u64(uint64_t *a, uint64x2_t b) { +__funline void vst1q_u64(uint64_t *a, uint64x2_t b) { __builtin_aarch64_st1v2di((__builtin_aarch64_simd_di *)a, (int64x2_t)b); } -FUNK void vst1_lane_f16(float16_t *__a, float16x4_t __b, const int __lane) { +__funline void vst1_lane_f16(float16_t *__a, float16x4_t __b, + const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1_lane_f32(float32_t *__a, float32x2_t __b, const int __lane) { +__funline void vst1_lane_f32(float32_t *__a, float32x2_t __b, + const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1_lane_f64(float64_t *__a, float64x1_t __b, const int __lane) { +__funline void vst1_lane_f64(float64_t *__a, float64x1_t __b, + const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1_lane_p8(poly8_t *__a, poly8x8_t __b, const int __lane) { +__funline void vst1_lane_p8(poly8_t *__a, poly8x8_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1_lane_p16(poly16_t *__a, poly16x4_t __b, const int __lane) { +__funline void vst1_lane_p16(poly16_t *__a, poly16x4_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } 
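The vst1_*/vst1q_* wrappers above are contiguous one-register ST1 stores, while the vst1[q]_lane_* forms store a single element selected by a lane index that must be a compile-time constant. A minimal usage sketch, assuming the arm_neon.h wrappers are in scope; the function and buffer names are hypothetical:

#include <arm_neon.h>
#include <stdint.h>

/* Contiguous 16-byte store of all four lanes to buf. */
static void store_vec(uint32_t *buf, uint32x4_t v) {
  vst1q_u32(buf, v);
}

/* Stores only lane 1 of v through p; writes a single float and
   touches no other memory. */
static void store_lane1(float32_t *p, float32x2_t v) {
  vst1_lane_f32(p, v, 1);
}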
-FUNK void vst1_lane_p64(poly64_t *__a, poly64x1_t __b, const int __lane) { +__funline void vst1_lane_p64(poly64_t *__a, poly64x1_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1_lane_s8(int8_t *__a, int8x8_t __b, const int __lane) { +__funline void vst1_lane_s8(int8_t *__a, int8x8_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1_lane_s16(int16_t *__a, int16x4_t __b, const int __lane) { +__funline void vst1_lane_s16(int16_t *__a, int16x4_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1_lane_s32(int32_t *__a, int32x2_t __b, const int __lane) { +__funline void vst1_lane_s32(int32_t *__a, int32x2_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1_lane_s64(int64_t *__a, int64x1_t __b, const int __lane) { +__funline void vst1_lane_s64(int64_t *__a, int64x1_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1_lane_u8(uint8_t *__a, uint8x8_t __b, const int __lane) { +__funline void vst1_lane_u8(uint8_t *__a, uint8x8_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1_lane_u16(uint16_t *__a, uint16x4_t __b, const int __lane) { +__funline void vst1_lane_u16(uint16_t *__a, uint16x4_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1_lane_u32(uint32_t *__a, uint32x2_t __b, const int __lane) { +__funline void vst1_lane_u32(uint32_t *__a, uint32x2_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1_lane_u64(uint64_t *__a, uint64x1_t __b, const int __lane) { +__funline void vst1_lane_u64(uint64_t *__a, uint64x1_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1q_lane_f16(float16_t *__a, float16x8_t __b, const int __lane) { +__funline void vst1q_lane_f16(float16_t *__a, float16x8_t __b, + const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1q_lane_f32(float32_t *__a, float32x4_t __b, const int __lane) { +__funline void vst1q_lane_f32(float32_t *__a, float32x4_t __b, + const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1q_lane_f64(float64_t *__a, float64x2_t __b, const int __lane) { +__funline void vst1q_lane_f64(float64_t *__a, float64x2_t __b, + const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1q_lane_p8(poly8_t *__a, poly8x16_t __b, const int __lane) { +__funline void vst1q_lane_p8(poly8_t *__a, poly8x16_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1q_lane_p16(poly16_t *__a, poly16x8_t __b, const int __lane) { +__funline void vst1q_lane_p16(poly16_t *__a, poly16x8_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1q_lane_p64(poly64_t *__a, poly64x2_t __b, const int __lane) { +__funline void vst1q_lane_p64(poly64_t *__a, poly64x2_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1q_lane_s8(int8_t *__a, int8x16_t __b, const int __lane) { +__funline void vst1q_lane_s8(int8_t *__a, int8x16_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1q_lane_s16(int16_t *__a, int16x8_t __b, const int __lane) { +__funline void vst1q_lane_s16(int16_t *__a, int16x8_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1q_lane_s32(int32_t *__a, int32x4_t __b, const int __lane) { +__funline void 
vst1q_lane_s32(int32_t *__a, int32x4_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1q_lane_s64(int64_t *__a, int64x2_t __b, const int __lane) { +__funline void vst1q_lane_s64(int64_t *__a, int64x2_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1q_lane_u8(uint8_t *__a, uint8x16_t __b, const int __lane) { +__funline void vst1q_lane_u8(uint8_t *__a, uint8x16_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1q_lane_u16(uint16_t *__a, uint16x8_t __b, const int __lane) { +__funline void vst1q_lane_u16(uint16_t *__a, uint16x8_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1q_lane_u32(uint32_t *__a, uint32x4_t __b, const int __lane) { +__funline void vst1q_lane_u32(uint32_t *__a, uint32x4_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1q_lane_u64(uint64_t *__a, uint64x2_t __b, const int __lane) { +__funline void vst1q_lane_u64(uint64_t *__a, uint64x2_t __b, const int __lane) { *__a = __aarch64_vget_lane_any(__b, __lane); } -FUNK void vst1_s64_x2(int64_t *__a, int64x1x2_t val) { +__funline void vst1_s64_x2(int64_t *__a, int64x1x2_t val) { __builtin_aarch64_simd_oi __o; int64x2x2_t temp; temp.val[0] = vcombine_s64(val.val[0], vcreate_s64(__AARCH64_INT64_C(0))); @@ -17910,7 +18027,7 @@ FUNK void vst1_s64_x2(int64_t *__a, int64x1x2_t val) { __builtin_aarch64_st1x2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst1_u64_x2(uint64_t *__a, uint64x1x2_t val) { +__funline void vst1_u64_x2(uint64_t *__a, uint64x1x2_t val) { __builtin_aarch64_simd_oi __o; uint64x2x2_t temp; temp.val[0] = vcombine_u64(val.val[0], vcreate_u64(__AARCH64_UINT64_C(0))); @@ -17920,7 +18037,7 @@ FUNK void vst1_u64_x2(uint64_t *__a, uint64x1x2_t val) { __builtin_aarch64_st1x2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst1_f64_x2(float64_t *__a, float64x1x2_t val) { +__funline void vst1_f64_x2(float64_t *__a, float64x1x2_t val) { __builtin_aarch64_simd_oi __o; float64x2x2_t temp; temp.val[0] = vcombine_f64(val.val[0], vcreate_f64(__AARCH64_UINT64_C(0))); @@ -17930,7 +18047,7 @@ FUNK void vst1_f64_x2(float64_t *__a, float64x1x2_t val) { __builtin_aarch64_st1x2df((__builtin_aarch64_simd_df *)__a, __o); } -FUNK void vst1_s8_x2(int8_t *__a, int8x8x2_t val) { +__funline void vst1_s8_x2(int8_t *__a, int8x8x2_t val) { __builtin_aarch64_simd_oi __o; int8x16x2_t temp; temp.val[0] = vcombine_s8(val.val[0], vcreate_s8(__AARCH64_INT64_C(0))); @@ -17940,7 +18057,7 @@ FUNK void vst1_s8_x2(int8_t *__a, int8x8x2_t val) { __builtin_aarch64_st1x2v8qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst1_p8_x2(poly8_t *__a, poly8x8x2_t val) { +__funline void vst1_p8_x2(poly8_t *__a, poly8x8x2_t val) { __builtin_aarch64_simd_oi __o; poly8x16x2_t temp; temp.val[0] = vcombine_p8(val.val[0], vcreate_p8(__AARCH64_UINT64_C(0))); @@ -17950,7 +18067,7 @@ FUNK void vst1_p8_x2(poly8_t *__a, poly8x8x2_t val) { __builtin_aarch64_st1x2v8qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst1_s16_x2(int16_t *__a, int16x4x2_t val) { +__funline void vst1_s16_x2(int16_t *__a, int16x4x2_t val) { __builtin_aarch64_simd_oi __o; int16x8x2_t temp; temp.val[0] = vcombine_s16(val.val[0], vcreate_s16(__AARCH64_INT64_C(0))); @@ -17960,7 +18077,7 @@ FUNK void vst1_s16_x2(int16_t *__a, int16x4x2_t val) { __builtin_aarch64_st1x2v4hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst1_p16_x2(poly16_t *__a, poly16x4x2_t val) { +__funline void 
vst1_p16_x2(poly16_t *__a, poly16x4x2_t val) { __builtin_aarch64_simd_oi __o; poly16x8x2_t temp; temp.val[0] = vcombine_p16(val.val[0], vcreate_p16(__AARCH64_UINT64_C(0))); @@ -17970,7 +18087,7 @@ FUNK void vst1_p16_x2(poly16_t *__a, poly16x4x2_t val) { __builtin_aarch64_st1x2v4hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst1_s32_x2(int32_t *__a, int32x2x2_t val) { +__funline void vst1_s32_x2(int32_t *__a, int32x2x2_t val) { __builtin_aarch64_simd_oi __o; int32x4x2_t temp; temp.val[0] = vcombine_s32(val.val[0], vcreate_s32(__AARCH64_INT64_C(0))); @@ -17980,7 +18097,7 @@ FUNK void vst1_s32_x2(int32_t *__a, int32x2x2_t val) { __builtin_aarch64_st1x2v2si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst1_u8_x2(uint8_t *__a, uint8x8x2_t val) { +__funline void vst1_u8_x2(uint8_t *__a, uint8x8x2_t val) { __builtin_aarch64_simd_oi __o; uint8x16x2_t temp; temp.val[0] = vcombine_u8(val.val[0], vcreate_u8(__AARCH64_UINT64_C(0))); @@ -17990,7 +18107,7 @@ FUNK void vst1_u8_x2(uint8_t *__a, uint8x8x2_t val) { __builtin_aarch64_st1x2v8qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst1_u16_x2(uint16_t *__a, uint16x4x2_t val) { +__funline void vst1_u16_x2(uint16_t *__a, uint16x4x2_t val) { __builtin_aarch64_simd_oi __o; uint16x8x2_t temp; temp.val[0] = vcombine_u16(val.val[0], vcreate_u16(__AARCH64_UINT64_C(0))); @@ -18000,7 +18117,7 @@ FUNK void vst1_u16_x2(uint16_t *__a, uint16x4x2_t val) { __builtin_aarch64_st1x2v4hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst1_u32_x2(uint32_t *__a, uint32x2x2_t val) { +__funline void vst1_u32_x2(uint32_t *__a, uint32x2x2_t val) { __builtin_aarch64_simd_oi __o; uint32x4x2_t temp; temp.val[0] = vcombine_u32(val.val[0], vcreate_u32(__AARCH64_UINT64_C(0))); @@ -18010,7 +18127,7 @@ FUNK void vst1_u32_x2(uint32_t *__a, uint32x2x2_t val) { __builtin_aarch64_st1x2v2si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst1_f16_x2(float16_t *__a, float16x4x2_t val) { +__funline void vst1_f16_x2(float16_t *__a, float16x4x2_t val) { __builtin_aarch64_simd_oi __o; float16x8x2_t temp; temp.val[0] = vcombine_f16(val.val[0], vcreate_f16(__AARCH64_UINT64_C(0))); @@ -18020,7 +18137,7 @@ FUNK void vst1_f16_x2(float16_t *__a, float16x4x2_t val) { __builtin_aarch64_st1x2v4hf(__a, __o); } -FUNK void vst1_f32_x2(float32_t *__a, float32x2x2_t val) { +__funline void vst1_f32_x2(float32_t *__a, float32x2x2_t val) { __builtin_aarch64_simd_oi __o; float32x4x2_t temp; temp.val[0] = vcombine_f32(val.val[0], vcreate_f32(__AARCH64_UINT64_C(0))); @@ -18030,7 +18147,7 @@ FUNK void vst1_f32_x2(float32_t *__a, float32x2x2_t val) { __builtin_aarch64_st1x2v2sf((__builtin_aarch64_simd_sf *)__a, __o); } -FUNK void vst1_p64_x2(poly64_t *__a, poly64x1x2_t val) { +__funline void vst1_p64_x2(poly64_t *__a, poly64x1x2_t val) { __builtin_aarch64_simd_oi __o; poly64x2x2_t temp; temp.val[0] = vcombine_p64(val.val[0], vcreate_p64(__AARCH64_UINT64_C(0))); @@ -18040,105 +18157,105 @@ FUNK void vst1_p64_x2(poly64_t *__a, poly64x1x2_t val) { __builtin_aarch64_st1x2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst1q_s8_x2(int8_t *__a, int8x16x2_t val) { +__funline void vst1q_s8_x2(int8_t *__a, int8x16x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)val.val[1], 1); __builtin_aarch64_st1x2v16qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst1q_p8_x2(poly8_t *__a, poly8x16x2_t val) { +__funline void vst1q_p8_x2(poly8_t *__a, poly8x16x2_t 
val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)val.val[1], 1); __builtin_aarch64_st1x2v16qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst1q_s16_x2(int16_t *__a, int16x8x2_t val) { +__funline void vst1q_s16_x2(int16_t *__a, int16x8x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv8hi(__o, (int16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv8hi(__o, (int16x8_t)val.val[1], 1); __builtin_aarch64_st1x2v8hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst1q_p16_x2(poly16_t *__a, poly16x8x2_t val) { +__funline void vst1q_p16_x2(poly16_t *__a, poly16x8x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv8hi(__o, (int16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv8hi(__o, (int16x8_t)val.val[1], 1); __builtin_aarch64_st1x2v8hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst1q_s32_x2(int32_t *__a, int32x4x2_t val) { +__funline void vst1q_s32_x2(int32_t *__a, int32x4x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv4si(__o, (int32x4_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv4si(__o, (int32x4_t)val.val[1], 1); __builtin_aarch64_st1x2v4si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst1q_s64_x2(int64_t *__a, int64x2x2_t val) { +__funline void vst1q_s64_x2(int64_t *__a, int64x2x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv2di(__o, (int64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv2di(__o, (int64x2_t)val.val[1], 1); __builtin_aarch64_st1x2v2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst1q_u8_x2(uint8_t *__a, uint8x16x2_t val) { +__funline void vst1q_u8_x2(uint8_t *__a, uint8x16x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)val.val[1], 1); __builtin_aarch64_st1x2v16qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst1q_u16_x2(uint16_t *__a, uint16x8x2_t val) { +__funline void vst1q_u16_x2(uint16_t *__a, uint16x8x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv8hi(__o, (int16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv8hi(__o, (int16x8_t)val.val[1], 1); __builtin_aarch64_st1x2v8hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst1q_u32_x2(uint32_t *__a, uint32x4x2_t val) { +__funline void vst1q_u32_x2(uint32_t *__a, uint32x4x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv4si(__o, (int32x4_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv4si(__o, (int32x4_t)val.val[1], 1); __builtin_aarch64_st1x2v4si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst1q_u64_x2(uint64_t *__a, uint64x2x2_t val) { +__funline void vst1q_u64_x2(uint64_t *__a, uint64x2x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv2di(__o, (int64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv2di(__o, (int64x2_t)val.val[1], 1); __builtin_aarch64_st1x2v2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst1q_f16_x2(float16_t *__a, float16x8x2_t val) { +__funline void vst1q_f16_x2(float16_t *__a, float16x8x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv8hf(__o, val.val[0], 0); __o = __builtin_aarch64_set_qregoiv8hf(__o, val.val[1], 1); __builtin_aarch64_st1x2v8hf(__a, __o); } -FUNK void vst1q_f32_x2(float32_t *__a, float32x4x2_t val) { 
+__funline void vst1q_f32_x2(float32_t *__a, float32x4x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv4sf(__o, (float32x4_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv4sf(__o, (float32x4_t)val.val[1], 1); __builtin_aarch64_st1x2v4sf((__builtin_aarch64_simd_sf *)__a, __o); } -FUNK void vst1q_f64_x2(float64_t *__a, float64x2x2_t val) { +__funline void vst1q_f64_x2(float64_t *__a, float64x2x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv2df(__o, (float64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv2df(__o, (float64x2_t)val.val[1], 1); __builtin_aarch64_st1x2v2df((__builtin_aarch64_simd_df *)__a, __o); } -FUNK void vst1q_p64_x2(poly64_t *__a, poly64x2x2_t val) { +__funline void vst1q_p64_x2(poly64_t *__a, poly64x2x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv2di_ssps(__o, (poly64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv2di_ssps(__o, (poly64x2_t)val.val[1], 1); __builtin_aarch64_st1x2v2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst1_s64_x3(int64_t *__a, int64x1x3_t val) { +__funline void vst1_s64_x3(int64_t *__a, int64x1x3_t val) { __builtin_aarch64_simd_ci __o; int64x2x3_t temp; temp.val[0] = vcombine_s64(val.val[0], vcreate_s64(__AARCH64_INT64_C(0))); @@ -18150,7 +18267,7 @@ FUNK void vst1_s64_x3(int64_t *__a, int64x1x3_t val) { __builtin_aarch64_st1x3di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst1_u64_x3(uint64_t *__a, uint64x1x3_t val) { +__funline void vst1_u64_x3(uint64_t *__a, uint64x1x3_t val) { __builtin_aarch64_simd_ci __o; uint64x2x3_t temp; temp.val[0] = vcombine_u64(val.val[0], vcreate_u64(__AARCH64_UINT64_C(0))); @@ -18162,7 +18279,7 @@ FUNK void vst1_u64_x3(uint64_t *__a, uint64x1x3_t val) { __builtin_aarch64_st1x3di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst1_f64_x3(float64_t *__a, float64x1x3_t val) { +__funline void vst1_f64_x3(float64_t *__a, float64x1x3_t val) { __builtin_aarch64_simd_ci __o; float64x2x3_t temp; temp.val[0] = vcombine_f64(val.val[0], vcreate_f64(__AARCH64_UINT64_C(0))); @@ -18174,7 +18291,7 @@ FUNK void vst1_f64_x3(float64_t *__a, float64x1x3_t val) { __builtin_aarch64_st1x3df((__builtin_aarch64_simd_df *)__a, __o); } -FUNK void vst1_s8_x3(int8_t *__a, int8x8x3_t val) { +__funline void vst1_s8_x3(int8_t *__a, int8x8x3_t val) { __builtin_aarch64_simd_ci __o; int8x16x3_t temp; temp.val[0] = vcombine_s8(val.val[0], vcreate_s8(__AARCH64_INT64_C(0))); @@ -18186,7 +18303,7 @@ FUNK void vst1_s8_x3(int8_t *__a, int8x8x3_t val) { __builtin_aarch64_st1x3v8qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst1_p8_x3(poly8_t *__a, poly8x8x3_t val) { +__funline void vst1_p8_x3(poly8_t *__a, poly8x8x3_t val) { __builtin_aarch64_simd_ci __o; poly8x16x3_t temp; temp.val[0] = vcombine_p8(val.val[0], vcreate_p8(__AARCH64_UINT64_C(0))); @@ -18198,7 +18315,7 @@ FUNK void vst1_p8_x3(poly8_t *__a, poly8x8x3_t val) { __builtin_aarch64_st1x3v8qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst1_s16_x3(int16_t *__a, int16x4x3_t val) { +__funline void vst1_s16_x3(int16_t *__a, int16x4x3_t val) { __builtin_aarch64_simd_ci __o; int16x8x3_t temp; temp.val[0] = vcombine_s16(val.val[0], vcreate_s16(__AARCH64_INT64_C(0))); @@ -18210,7 +18327,7 @@ FUNK void vst1_s16_x3(int16_t *__a, int16x4x3_t val) { __builtin_aarch64_st1x3v4hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst1_p16_x3(poly16_t *__a, poly16x4x3_t val) { +__funline void vst1_p16_x3(poly16_t *__a, poly16x4x3_t val) { __builtin_aarch64_simd_ci 
__o; poly16x8x3_t temp; temp.val[0] = vcombine_p16(val.val[0], vcreate_p16(__AARCH64_UINT64_C(0))); @@ -18222,7 +18339,7 @@ FUNK void vst1_p16_x3(poly16_t *__a, poly16x4x3_t val) { __builtin_aarch64_st1x3v4hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst1_s32_x3(int32_t *__a, int32x2x3_t val) { +__funline void vst1_s32_x3(int32_t *__a, int32x2x3_t val) { __builtin_aarch64_simd_ci __o; int32x4x3_t temp; temp.val[0] = vcombine_s32(val.val[0], vcreate_s32(__AARCH64_INT64_C(0))); @@ -18234,7 +18351,7 @@ FUNK void vst1_s32_x3(int32_t *__a, int32x2x3_t val) { __builtin_aarch64_st1x3v2si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst1_u8_x3(uint8_t *__a, uint8x8x3_t val) { +__funline void vst1_u8_x3(uint8_t *__a, uint8x8x3_t val) { __builtin_aarch64_simd_ci __o; uint8x16x3_t temp; temp.val[0] = vcombine_u8(val.val[0], vcreate_u8(__AARCH64_UINT64_C(0))); @@ -18246,7 +18363,7 @@ FUNK void vst1_u8_x3(uint8_t *__a, uint8x8x3_t val) { __builtin_aarch64_st1x3v8qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst1_u16_x3(uint16_t *__a, uint16x4x3_t val) { +__funline void vst1_u16_x3(uint16_t *__a, uint16x4x3_t val) { __builtin_aarch64_simd_ci __o; uint16x8x3_t temp; temp.val[0] = vcombine_u16(val.val[0], vcreate_u16(__AARCH64_UINT64_C(0))); @@ -18258,7 +18375,7 @@ FUNK void vst1_u16_x3(uint16_t *__a, uint16x4x3_t val) { __builtin_aarch64_st1x3v4hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst1_u32_x3(uint32_t *__a, uint32x2x3_t val) { +__funline void vst1_u32_x3(uint32_t *__a, uint32x2x3_t val) { __builtin_aarch64_simd_ci __o; uint32x4x3_t temp; temp.val[0] = vcombine_u32(val.val[0], vcreate_u32(__AARCH64_UINT64_C(0))); @@ -18270,7 +18387,7 @@ FUNK void vst1_u32_x3(uint32_t *__a, uint32x2x3_t val) { __builtin_aarch64_st1x3v2si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst1_f16_x3(float16_t *__a, float16x4x3_t val) { +__funline void vst1_f16_x3(float16_t *__a, float16x4x3_t val) { __builtin_aarch64_simd_ci __o; float16x8x3_t temp; temp.val[0] = vcombine_f16(val.val[0], vcreate_f16(__AARCH64_UINT64_C(0))); @@ -18282,7 +18399,7 @@ FUNK void vst1_f16_x3(float16_t *__a, float16x4x3_t val) { __builtin_aarch64_st1x3v4hf((__builtin_aarch64_simd_hf *)__a, __o); } -FUNK void vst1_f32_x3(float32_t *__a, float32x2x3_t val) { +__funline void vst1_f32_x3(float32_t *__a, float32x2x3_t val) { __builtin_aarch64_simd_ci __o; float32x4x3_t temp; temp.val[0] = vcombine_f32(val.val[0], vcreate_f32(__AARCH64_UINT64_C(0))); @@ -18294,7 +18411,7 @@ FUNK void vst1_f32_x3(float32_t *__a, float32x2x3_t val) { __builtin_aarch64_st1x3v2sf((__builtin_aarch64_simd_sf *)__a, __o); } -FUNK void vst1_p64_x3(poly64_t *__a, poly64x1x3_t val) { +__funline void vst1_p64_x3(poly64_t *__a, poly64x1x3_t val) { __builtin_aarch64_simd_ci __o; poly64x2x3_t temp; temp.val[0] = vcombine_p64(val.val[0], vcreate_p64(__AARCH64_UINT64_C(0))); @@ -18306,7 +18423,7 @@ FUNK void vst1_p64_x3(poly64_t *__a, poly64x1x3_t val) { __builtin_aarch64_st1x3di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst1q_s8_x3(int8_t *__a, int8x16x3_t val) { +__funline void vst1q_s8_x3(int8_t *__a, int8x16x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)val.val[1], 1); @@ -18314,7 +18431,7 @@ FUNK void vst1q_s8_x3(int8_t *__a, int8x16x3_t val) { __builtin_aarch64_st1x3v16qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst1q_p8_x3(poly8_t *__a, poly8x16x3_t val) { +__funline void 
vst1q_p8_x3(poly8_t *__a, poly8x16x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)val.val[1], 1); @@ -18322,7 +18439,7 @@ FUNK void vst1q_p8_x3(poly8_t *__a, poly8x16x3_t val) { __builtin_aarch64_st1x3v16qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst1q_s16_x3(int16_t *__a, int16x8x3_t val) { +__funline void vst1q_s16_x3(int16_t *__a, int16x8x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv8hi(__o, (int16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv8hi(__o, (int16x8_t)val.val[1], 1); @@ -18330,7 +18447,7 @@ FUNK void vst1q_s16_x3(int16_t *__a, int16x8x3_t val) { __builtin_aarch64_st1x3v8hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst1q_p16_x3(poly16_t *__a, poly16x8x3_t val) { +__funline void vst1q_p16_x3(poly16_t *__a, poly16x8x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv8hi(__o, (int16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv8hi(__o, (int16x8_t)val.val[1], 1); @@ -18338,7 +18455,7 @@ FUNK void vst1q_p16_x3(poly16_t *__a, poly16x8x3_t val) { __builtin_aarch64_st1x3v8hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst1q_s32_x3(int32_t *__a, int32x4x3_t val) { +__funline void vst1q_s32_x3(int32_t *__a, int32x4x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv4si(__o, (int32x4_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv4si(__o, (int32x4_t)val.val[1], 1); @@ -18346,7 +18463,7 @@ FUNK void vst1q_s32_x3(int32_t *__a, int32x4x3_t val) { __builtin_aarch64_st1x3v4si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst1q_s64_x3(int64_t *__a, int64x2x3_t val) { +__funline void vst1q_s64_x3(int64_t *__a, int64x2x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv2di(__o, (int64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv2di(__o, (int64x2_t)val.val[1], 1); @@ -18354,7 +18471,7 @@ FUNK void vst1q_s64_x3(int64_t *__a, int64x2x3_t val) { __builtin_aarch64_st1x3v2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst1q_u8_x3(uint8_t *__a, uint8x16x3_t val) { +__funline void vst1q_u8_x3(uint8_t *__a, uint8x16x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)val.val[1], 1); @@ -18362,7 +18479,7 @@ FUNK void vst1q_u8_x3(uint8_t *__a, uint8x16x3_t val) { __builtin_aarch64_st1x3v16qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst1q_u16_x3(uint16_t *__a, uint16x8x3_t val) { +__funline void vst1q_u16_x3(uint16_t *__a, uint16x8x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv8hi(__o, (int16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv8hi(__o, (int16x8_t)val.val[1], 1); @@ -18370,7 +18487,7 @@ FUNK void vst1q_u16_x3(uint16_t *__a, uint16x8x3_t val) { __builtin_aarch64_st1x3v8hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst1q_u32_x3(uint32_t *__a, uint32x4x3_t val) { +__funline void vst1q_u32_x3(uint32_t *__a, uint32x4x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv4si(__o, (int32x4_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv4si(__o, (int32x4_t)val.val[1], 1); @@ -18378,7 +18495,7 @@ FUNK void vst1q_u32_x3(uint32_t *__a, uint32x4x3_t val) { __builtin_aarch64_st1x3v4si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst1q_u64_x3(uint64_t *__a, uint64x2x3_t val) { 
+__funline void vst1q_u64_x3(uint64_t *__a, uint64x2x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv2di(__o, (int64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv2di(__o, (int64x2_t)val.val[1], 1); @@ -18386,7 +18503,7 @@ FUNK void vst1q_u64_x3(uint64_t *__a, uint64x2x3_t val) { __builtin_aarch64_st1x3v2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst1q_f16_x3(float16_t *__a, float16x8x3_t val) { +__funline void vst1q_f16_x3(float16_t *__a, float16x8x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv8hf(__o, (float16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv8hf(__o, (float16x8_t)val.val[1], 1); @@ -18394,7 +18511,7 @@ FUNK void vst1q_f16_x3(float16_t *__a, float16x8x3_t val) { __builtin_aarch64_st1x3v8hf((__builtin_aarch64_simd_hf *)__a, __o); } -FUNK void vst1q_f32_x3(float32_t *__a, float32x4x3_t val) { +__funline void vst1q_f32_x3(float32_t *__a, float32x4x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv4sf(__o, (float32x4_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv4sf(__o, (float32x4_t)val.val[1], 1); @@ -18402,7 +18519,7 @@ FUNK void vst1q_f32_x3(float32_t *__a, float32x4x3_t val) { __builtin_aarch64_st1x3v4sf((__builtin_aarch64_simd_sf *)__a, __o); } -FUNK void vst1q_f64_x3(float64_t *__a, float64x2x3_t val) { +__funline void vst1q_f64_x3(float64_t *__a, float64x2x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv2df(__o, (float64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv2df(__o, (float64x2_t)val.val[1], 1); @@ -18410,7 +18527,7 @@ FUNK void vst1q_f64_x3(float64_t *__a, float64x2x3_t val) { __builtin_aarch64_st1x3v2df((__builtin_aarch64_simd_df *)__a, __o); } -FUNK void vst1q_p64_x3(poly64_t *__a, poly64x2x3_t val) { +__funline void vst1q_p64_x3(poly64_t *__a, poly64x2x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv2di_ssps(__o, (poly64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv2di_ssps(__o, (poly64x2_t)val.val[1], 1); @@ -18418,7 +18535,7 @@ FUNK void vst1q_p64_x3(poly64_t *__a, poly64x2x3_t val) { __builtin_aarch64_st1x3v2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst2_s64(int64_t *__a, int64x1x2_t val) { +__funline void vst2_s64(int64_t *__a, int64x1x2_t val) { __builtin_aarch64_simd_oi __o; int64x2x2_t temp; temp.val[0] = vcombine_s64(val.val[0], vcreate_s64(__AARCH64_INT64_C(0))); @@ -18428,7 +18545,7 @@ FUNK void vst2_s64(int64_t *__a, int64x1x2_t val) { __builtin_aarch64_st2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst2_u64(uint64_t *__a, uint64x1x2_t val) { +__funline void vst2_u64(uint64_t *__a, uint64x1x2_t val) { __builtin_aarch64_simd_oi __o; uint64x2x2_t temp; temp.val[0] = vcombine_u64(val.val[0], vcreate_u64(__AARCH64_UINT64_C(0))); @@ -18438,7 +18555,7 @@ FUNK void vst2_u64(uint64_t *__a, uint64x1x2_t val) { __builtin_aarch64_st2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst2_f64(float64_t *__a, float64x1x2_t val) { +__funline void vst2_f64(float64_t *__a, float64x1x2_t val) { __builtin_aarch64_simd_oi __o; float64x2x2_t temp; temp.val[0] = vcombine_f64(val.val[0], vcreate_f64(__AARCH64_UINT64_C(0))); @@ -18448,7 +18565,7 @@ FUNK void vst2_f64(float64_t *__a, float64x1x2_t val) { __builtin_aarch64_st2df((__builtin_aarch64_simd_df *)__a, __o); } -FUNK void vst2_s8(int8_t *__a, int8x8x2_t val) { +__funline void vst2_s8(int8_t *__a, int8x8x2_t val) { __builtin_aarch64_simd_oi __o; int8x16x2_t temp; temp.val[0] = 
vcombine_s8(val.val[0], vcreate_s8(__AARCH64_INT64_C(0))); @@ -18458,7 +18575,7 @@ FUNK void vst2_s8(int8_t *__a, int8x8x2_t val) { __builtin_aarch64_st2v8qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst2_p8(poly8_t *__a, poly8x8x2_t val) { +__funline void vst2_p8(poly8_t *__a, poly8x8x2_t val) { __builtin_aarch64_simd_oi __o; poly8x16x2_t temp; temp.val[0] = vcombine_p8(val.val[0], vcreate_p8(__AARCH64_UINT64_C(0))); @@ -18468,7 +18585,7 @@ FUNK void vst2_p8(poly8_t *__a, poly8x8x2_t val) { __builtin_aarch64_st2v8qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst2_s16(int16_t *__a, int16x4x2_t val) { +__funline void vst2_s16(int16_t *__a, int16x4x2_t val) { __builtin_aarch64_simd_oi __o; int16x8x2_t temp; temp.val[0] = vcombine_s16(val.val[0], vcreate_s16(__AARCH64_INT64_C(0))); @@ -18478,7 +18595,7 @@ FUNK void vst2_s16(int16_t *__a, int16x4x2_t val) { __builtin_aarch64_st2v4hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst2_p16(poly16_t *__a, poly16x4x2_t val) { +__funline void vst2_p16(poly16_t *__a, poly16x4x2_t val) { __builtin_aarch64_simd_oi __o; poly16x8x2_t temp; temp.val[0] = vcombine_p16(val.val[0], vcreate_p16(__AARCH64_UINT64_C(0))); @@ -18488,7 +18605,7 @@ FUNK void vst2_p16(poly16_t *__a, poly16x4x2_t val) { __builtin_aarch64_st2v4hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst2_s32(int32_t *__a, int32x2x2_t val) { +__funline void vst2_s32(int32_t *__a, int32x2x2_t val) { __builtin_aarch64_simd_oi __o; int32x4x2_t temp; temp.val[0] = vcombine_s32(val.val[0], vcreate_s32(__AARCH64_INT64_C(0))); @@ -18498,7 +18615,7 @@ FUNK void vst2_s32(int32_t *__a, int32x2x2_t val) { __builtin_aarch64_st2v2si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst2_u8(uint8_t *__a, uint8x8x2_t val) { +__funline void vst2_u8(uint8_t *__a, uint8x8x2_t val) { __builtin_aarch64_simd_oi __o; uint8x16x2_t temp; temp.val[0] = vcombine_u8(val.val[0], vcreate_u8(__AARCH64_UINT64_C(0))); @@ -18508,7 +18625,7 @@ FUNK void vst2_u8(uint8_t *__a, uint8x8x2_t val) { __builtin_aarch64_st2v8qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst2_u16(uint16_t *__a, uint16x4x2_t val) { +__funline void vst2_u16(uint16_t *__a, uint16x4x2_t val) { __builtin_aarch64_simd_oi __o; uint16x8x2_t temp; temp.val[0] = vcombine_u16(val.val[0], vcreate_u16(__AARCH64_UINT64_C(0))); @@ -18518,7 +18635,7 @@ FUNK void vst2_u16(uint16_t *__a, uint16x4x2_t val) { __builtin_aarch64_st2v4hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst2_u32(uint32_t *__a, uint32x2x2_t val) { +__funline void vst2_u32(uint32_t *__a, uint32x2x2_t val) { __builtin_aarch64_simd_oi __o; uint32x4x2_t temp; temp.val[0] = vcombine_u32(val.val[0], vcreate_u32(__AARCH64_UINT64_C(0))); @@ -18528,7 +18645,7 @@ FUNK void vst2_u32(uint32_t *__a, uint32x2x2_t val) { __builtin_aarch64_st2v2si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst2_f16(float16_t *__a, float16x4x2_t val) { +__funline void vst2_f16(float16_t *__a, float16x4x2_t val) { __builtin_aarch64_simd_oi __o; float16x8x2_t temp; temp.val[0] = vcombine_f16(val.val[0], vcreate_f16(__AARCH64_UINT64_C(0))); @@ -18538,7 +18655,7 @@ FUNK void vst2_f16(float16_t *__a, float16x4x2_t val) { __builtin_aarch64_st2v4hf(__a, __o); } -FUNK void vst2_f32(float32_t *__a, float32x2x2_t val) { +__funline void vst2_f32(float32_t *__a, float32x2x2_t val) { __builtin_aarch64_simd_oi __o; float32x4x2_t temp; temp.val[0] = vcombine_f32(val.val[0], vcreate_f32(__AARCH64_UINT64_C(0))); @@ -18548,7 +18665,7 @@ FUNK void vst2_f32(float32_t *__a, float32x2x2_t 
val) { __builtin_aarch64_st2v2sf((__builtin_aarch64_simd_sf *)__a, __o); } -FUNK void vst2_p64(poly64_t *__a, poly64x1x2_t val) { +__funline void vst2_p64(poly64_t *__a, poly64x1x2_t val) { __builtin_aarch64_simd_oi __o; poly64x2x2_t temp; temp.val[0] = vcombine_p64(val.val[0], vcreate_p64(__AARCH64_UINT64_C(0))); @@ -18558,105 +18675,105 @@ FUNK void vst2_p64(poly64_t *__a, poly64x1x2_t val) { __builtin_aarch64_st2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst2q_s8(int8_t *__a, int8x16x2_t val) { +__funline void vst2q_s8(int8_t *__a, int8x16x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)val.val[1], 1); __builtin_aarch64_st2v16qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst2q_p8(poly8_t *__a, poly8x16x2_t val) { +__funline void vst2q_p8(poly8_t *__a, poly8x16x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)val.val[1], 1); __builtin_aarch64_st2v16qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst2q_s16(int16_t *__a, int16x8x2_t val) { +__funline void vst2q_s16(int16_t *__a, int16x8x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv8hi(__o, (int16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv8hi(__o, (int16x8_t)val.val[1], 1); __builtin_aarch64_st2v8hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst2q_p16(poly16_t *__a, poly16x8x2_t val) { +__funline void vst2q_p16(poly16_t *__a, poly16x8x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv8hi(__o, (int16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv8hi(__o, (int16x8_t)val.val[1], 1); __builtin_aarch64_st2v8hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst2q_s32(int32_t *__a, int32x4x2_t val) { +__funline void vst2q_s32(int32_t *__a, int32x4x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv4si(__o, (int32x4_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv4si(__o, (int32x4_t)val.val[1], 1); __builtin_aarch64_st2v4si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst2q_s64(int64_t *__a, int64x2x2_t val) { +__funline void vst2q_s64(int64_t *__a, int64x2x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv2di(__o, (int64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv2di(__o, (int64x2_t)val.val[1], 1); __builtin_aarch64_st2v2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst2q_u8(uint8_t *__a, uint8x16x2_t val) { +__funline void vst2q_u8(uint8_t *__a, uint8x16x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi(__o, (int8x16_t)val.val[1], 1); __builtin_aarch64_st2v16qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst2q_u16(uint16_t *__a, uint16x8x2_t val) { +__funline void vst2q_u16(uint16_t *__a, uint16x8x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv8hi(__o, (int16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv8hi(__o, (int16x8_t)val.val[1], 1); __builtin_aarch64_st2v8hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst2q_u32(uint32_t *__a, uint32x4x2_t val) { +__funline void vst2q_u32(uint32_t *__a, uint32x4x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv4si(__o, (int32x4_t)val.val[0], 0); __o = 
__builtin_aarch64_set_qregoiv4si(__o, (int32x4_t)val.val[1], 1); __builtin_aarch64_st2v4si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst2q_u64(uint64_t *__a, uint64x2x2_t val) { +__funline void vst2q_u64(uint64_t *__a, uint64x2x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv2di(__o, (int64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv2di(__o, (int64x2_t)val.val[1], 1); __builtin_aarch64_st2v2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst2q_f16(float16_t *__a, float16x8x2_t val) { +__funline void vst2q_f16(float16_t *__a, float16x8x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv8hf(__o, val.val[0], 0); __o = __builtin_aarch64_set_qregoiv8hf(__o, val.val[1], 1); __builtin_aarch64_st2v8hf(__a, __o); } -FUNK void vst2q_f32(float32_t *__a, float32x4x2_t val) { +__funline void vst2q_f32(float32_t *__a, float32x4x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv4sf(__o, (float32x4_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv4sf(__o, (float32x4_t)val.val[1], 1); __builtin_aarch64_st2v4sf((__builtin_aarch64_simd_sf *)__a, __o); } -FUNK void vst2q_f64(float64_t *__a, float64x2x2_t val) { +__funline void vst2q_f64(float64_t *__a, float64x2x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv2df(__o, (float64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv2df(__o, (float64x2_t)val.val[1], 1); __builtin_aarch64_st2v2df((__builtin_aarch64_simd_df *)__a, __o); } -FUNK void vst2q_p64(poly64_t *__a, poly64x2x2_t val) { +__funline void vst2q_p64(poly64_t *__a, poly64x2x2_t val) { __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_set_qregoiv2di_ssps(__o, (poly64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregoiv2di_ssps(__o, (poly64x2_t)val.val[1], 1); __builtin_aarch64_st2v2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst3_s64(int64_t *__a, int64x1x3_t val) { +__funline void vst3_s64(int64_t *__a, int64x1x3_t val) { __builtin_aarch64_simd_ci __o; int64x2x3_t temp; temp.val[0] = vcombine_s64(val.val[0], vcreate_s64(__AARCH64_INT64_C(0))); @@ -18668,7 +18785,7 @@ FUNK void vst3_s64(int64_t *__a, int64x1x3_t val) { __builtin_aarch64_st3di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst3_u64(uint64_t *__a, uint64x1x3_t val) { +__funline void vst3_u64(uint64_t *__a, uint64x1x3_t val) { __builtin_aarch64_simd_ci __o; uint64x2x3_t temp; temp.val[0] = vcombine_u64(val.val[0], vcreate_u64(__AARCH64_UINT64_C(0))); @@ -18680,7 +18797,7 @@ FUNK void vst3_u64(uint64_t *__a, uint64x1x3_t val) { __builtin_aarch64_st3di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst3_f64(float64_t *__a, float64x1x3_t val) { +__funline void vst3_f64(float64_t *__a, float64x1x3_t val) { __builtin_aarch64_simd_ci __o; float64x2x3_t temp; temp.val[0] = vcombine_f64(val.val[0], vcreate_f64(__AARCH64_UINT64_C(0))); @@ -18692,7 +18809,7 @@ FUNK void vst3_f64(float64_t *__a, float64x1x3_t val) { __builtin_aarch64_st3df((__builtin_aarch64_simd_df *)__a, __o); } -FUNK void vst3_s8(int8_t *__a, int8x8x3_t val) { +__funline void vst3_s8(int8_t *__a, int8x8x3_t val) { __builtin_aarch64_simd_ci __o; int8x16x3_t temp; temp.val[0] = vcombine_s8(val.val[0], vcreate_s8(__AARCH64_INT64_C(0))); @@ -18704,7 +18821,7 @@ FUNK void vst3_s8(int8_t *__a, int8x8x3_t val) { __builtin_aarch64_st3v8qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst3_p8(poly8_t *__a, poly8x8x3_t val) { +__funline void vst3_p8(poly8_t *__a, poly8x8x3_t val) { 
__builtin_aarch64_simd_ci __o; poly8x16x3_t temp; temp.val[0] = vcombine_p8(val.val[0], vcreate_p8(__AARCH64_UINT64_C(0))); @@ -18716,7 +18833,7 @@ FUNK void vst3_p8(poly8_t *__a, poly8x8x3_t val) { __builtin_aarch64_st3v8qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst3_s16(int16_t *__a, int16x4x3_t val) { +__funline void vst3_s16(int16_t *__a, int16x4x3_t val) { __builtin_aarch64_simd_ci __o; int16x8x3_t temp; temp.val[0] = vcombine_s16(val.val[0], vcreate_s16(__AARCH64_INT64_C(0))); @@ -18728,7 +18845,7 @@ FUNK void vst3_s16(int16_t *__a, int16x4x3_t val) { __builtin_aarch64_st3v4hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst3_p16(poly16_t *__a, poly16x4x3_t val) { +__funline void vst3_p16(poly16_t *__a, poly16x4x3_t val) { __builtin_aarch64_simd_ci __o; poly16x8x3_t temp; temp.val[0] = vcombine_p16(val.val[0], vcreate_p16(__AARCH64_UINT64_C(0))); @@ -18740,7 +18857,7 @@ FUNK void vst3_p16(poly16_t *__a, poly16x4x3_t val) { __builtin_aarch64_st3v4hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst3_s32(int32_t *__a, int32x2x3_t val) { +__funline void vst3_s32(int32_t *__a, int32x2x3_t val) { __builtin_aarch64_simd_ci __o; int32x4x3_t temp; temp.val[0] = vcombine_s32(val.val[0], vcreate_s32(__AARCH64_INT64_C(0))); @@ -18752,7 +18869,7 @@ FUNK void vst3_s32(int32_t *__a, int32x2x3_t val) { __builtin_aarch64_st3v2si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst3_u8(uint8_t *__a, uint8x8x3_t val) { +__funline void vst3_u8(uint8_t *__a, uint8x8x3_t val) { __builtin_aarch64_simd_ci __o; uint8x16x3_t temp; temp.val[0] = vcombine_u8(val.val[0], vcreate_u8(__AARCH64_UINT64_C(0))); @@ -18764,7 +18881,7 @@ FUNK void vst3_u8(uint8_t *__a, uint8x8x3_t val) { __builtin_aarch64_st3v8qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst3_u16(uint16_t *__a, uint16x4x3_t val) { +__funline void vst3_u16(uint16_t *__a, uint16x4x3_t val) { __builtin_aarch64_simd_ci __o; uint16x8x3_t temp; temp.val[0] = vcombine_u16(val.val[0], vcreate_u16(__AARCH64_UINT64_C(0))); @@ -18776,7 +18893,7 @@ FUNK void vst3_u16(uint16_t *__a, uint16x4x3_t val) { __builtin_aarch64_st3v4hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst3_u32(uint32_t *__a, uint32x2x3_t val) { +__funline void vst3_u32(uint32_t *__a, uint32x2x3_t val) { __builtin_aarch64_simd_ci __o; uint32x4x3_t temp; temp.val[0] = vcombine_u32(val.val[0], vcreate_u32(__AARCH64_UINT64_C(0))); @@ -18788,7 +18905,7 @@ FUNK void vst3_u32(uint32_t *__a, uint32x2x3_t val) { __builtin_aarch64_st3v2si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst3_f16(float16_t *__a, float16x4x3_t val) { +__funline void vst3_f16(float16_t *__a, float16x4x3_t val) { __builtin_aarch64_simd_ci __o; float16x8x3_t temp; temp.val[0] = vcombine_f16(val.val[0], vcreate_f16(__AARCH64_UINT64_C(0))); @@ -18800,7 +18917,7 @@ FUNK void vst3_f16(float16_t *__a, float16x4x3_t val) { __builtin_aarch64_st3v4hf((__builtin_aarch64_simd_hf *)__a, __o); } -FUNK void vst3_f32(float32_t *__a, float32x2x3_t val) { +__funline void vst3_f32(float32_t *__a, float32x2x3_t val) { __builtin_aarch64_simd_ci __o; float32x4x3_t temp; temp.val[0] = vcombine_f32(val.val[0], vcreate_f32(__AARCH64_UINT64_C(0))); @@ -18812,7 +18929,7 @@ FUNK void vst3_f32(float32_t *__a, float32x2x3_t val) { __builtin_aarch64_st3v2sf((__builtin_aarch64_simd_sf *)__a, __o); } -FUNK void vst3_p64(poly64_t *__a, poly64x1x3_t val) { +__funline void vst3_p64(poly64_t *__a, poly64x1x3_t val) { __builtin_aarch64_simd_ci __o; poly64x2x3_t temp; temp.val[0] = vcombine_p64(val.val[0], 
vcreate_p64(__AARCH64_UINT64_C(0))); @@ -18824,7 +18941,7 @@ FUNK void vst3_p64(poly64_t *__a, poly64x1x3_t val) { __builtin_aarch64_st3di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst3q_s8(int8_t *__a, int8x16x3_t val) { +__funline void vst3q_s8(int8_t *__a, int8x16x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)val.val[1], 1); @@ -18832,7 +18949,7 @@ FUNK void vst3q_s8(int8_t *__a, int8x16x3_t val) { __builtin_aarch64_st3v16qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst3q_p8(poly8_t *__a, poly8x16x3_t val) { +__funline void vst3q_p8(poly8_t *__a, poly8x16x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)val.val[1], 1); @@ -18840,7 +18957,7 @@ FUNK void vst3q_p8(poly8_t *__a, poly8x16x3_t val) { __builtin_aarch64_st3v16qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst3q_s16(int16_t *__a, int16x8x3_t val) { +__funline void vst3q_s16(int16_t *__a, int16x8x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv8hi(__o, (int16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv8hi(__o, (int16x8_t)val.val[1], 1); @@ -18848,7 +18965,7 @@ FUNK void vst3q_s16(int16_t *__a, int16x8x3_t val) { __builtin_aarch64_st3v8hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst3q_p16(poly16_t *__a, poly16x8x3_t val) { +__funline void vst3q_p16(poly16_t *__a, poly16x8x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv8hi(__o, (int16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv8hi(__o, (int16x8_t)val.val[1], 1); @@ -18856,7 +18973,7 @@ FUNK void vst3q_p16(poly16_t *__a, poly16x8x3_t val) { __builtin_aarch64_st3v8hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst3q_s32(int32_t *__a, int32x4x3_t val) { +__funline void vst3q_s32(int32_t *__a, int32x4x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv4si(__o, (int32x4_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv4si(__o, (int32x4_t)val.val[1], 1); @@ -18864,7 +18981,7 @@ FUNK void vst3q_s32(int32_t *__a, int32x4x3_t val) { __builtin_aarch64_st3v4si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst3q_s64(int64_t *__a, int64x2x3_t val) { +__funline void vst3q_s64(int64_t *__a, int64x2x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv2di(__o, (int64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv2di(__o, (int64x2_t)val.val[1], 1); @@ -18872,7 +18989,7 @@ FUNK void vst3q_s64(int64_t *__a, int64x2x3_t val) { __builtin_aarch64_st3v2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst3q_u8(uint8_t *__a, uint8x16x3_t val) { +__funline void vst3q_u8(uint8_t *__a, uint8x16x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv16qi(__o, (int8x16_t)val.val[1], 1); @@ -18880,7 +18997,7 @@ FUNK void vst3q_u8(uint8_t *__a, uint8x16x3_t val) { __builtin_aarch64_st3v16qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst3q_u16(uint16_t *__a, uint16x8x3_t val) { +__funline void vst3q_u16(uint16_t *__a, uint16x8x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv8hi(__o, (int16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv8hi(__o, (int16x8_t)val.val[1], 1); @@ -18888,7 +19005,7 @@ FUNK void vst3q_u16(uint16_t 
*__a, uint16x8x3_t val) { __builtin_aarch64_st3v8hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst3q_u32(uint32_t *__a, uint32x4x3_t val) { +__funline void vst3q_u32(uint32_t *__a, uint32x4x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv4si(__o, (int32x4_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv4si(__o, (int32x4_t)val.val[1], 1); @@ -18896,7 +19013,7 @@ FUNK void vst3q_u32(uint32_t *__a, uint32x4x3_t val) { __builtin_aarch64_st3v4si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst3q_u64(uint64_t *__a, uint64x2x3_t val) { +__funline void vst3q_u64(uint64_t *__a, uint64x2x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv2di(__o, (int64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv2di(__o, (int64x2_t)val.val[1], 1); @@ -18904,7 +19021,7 @@ FUNK void vst3q_u64(uint64_t *__a, uint64x2x3_t val) { __builtin_aarch64_st3v2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst3q_f16(float16_t *__a, float16x8x3_t val) { +__funline void vst3q_f16(float16_t *__a, float16x8x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv8hf(__o, (float16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv8hf(__o, (float16x8_t)val.val[1], 1); @@ -18912,7 +19029,7 @@ FUNK void vst3q_f16(float16_t *__a, float16x8x3_t val) { __builtin_aarch64_st3v8hf((__builtin_aarch64_simd_hf *)__a, __o); } -FUNK void vst3q_f32(float32_t *__a, float32x4x3_t val) { +__funline void vst3q_f32(float32_t *__a, float32x4x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv4sf(__o, (float32x4_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv4sf(__o, (float32x4_t)val.val[1], 1); @@ -18920,7 +19037,7 @@ FUNK void vst3q_f32(float32_t *__a, float32x4x3_t val) { __builtin_aarch64_st3v4sf((__builtin_aarch64_simd_sf *)__a, __o); } -FUNK void vst3q_f64(float64_t *__a, float64x2x3_t val) { +__funline void vst3q_f64(float64_t *__a, float64x2x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv2df(__o, (float64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv2df(__o, (float64x2_t)val.val[1], 1); @@ -18928,7 +19045,7 @@ FUNK void vst3q_f64(float64_t *__a, float64x2x3_t val) { __builtin_aarch64_st3v2df((__builtin_aarch64_simd_df *)__a, __o); } -FUNK void vst3q_p64(poly64_t *__a, poly64x2x3_t val) { +__funline void vst3q_p64(poly64_t *__a, poly64x2x3_t val) { __builtin_aarch64_simd_ci __o; __o = __builtin_aarch64_set_qregciv2di_ssps(__o, (poly64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregciv2di_ssps(__o, (poly64x2_t)val.val[1], 1); @@ -18936,7 +19053,7 @@ FUNK void vst3q_p64(poly64_t *__a, poly64x2x3_t val) { __builtin_aarch64_st3v2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst4_s64(int64_t *__a, int64x1x4_t val) { +__funline void vst4_s64(int64_t *__a, int64x1x4_t val) { __builtin_aarch64_simd_xi __o; int64x2x4_t temp; temp.val[0] = vcombine_s64(val.val[0], vcreate_s64(__AARCH64_INT64_C(0))); @@ -18950,7 +19067,7 @@ FUNK void vst4_s64(int64_t *__a, int64x1x4_t val) { __builtin_aarch64_st4di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst4_u64(uint64_t *__a, uint64x1x4_t val) { +__funline void vst4_u64(uint64_t *__a, uint64x1x4_t val) { __builtin_aarch64_simd_xi __o; uint64x2x4_t temp; temp.val[0] = vcombine_u64(val.val[0], vcreate_u64(__AARCH64_UINT64_C(0))); @@ -18964,7 +19081,7 @@ FUNK void vst4_u64(uint64_t *__a, uint64x1x4_t val) { __builtin_aarch64_st4di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst4_f64(float64_t 
*__a, float64x1x4_t val) { +__funline void vst4_f64(float64_t *__a, float64x1x4_t val) { __builtin_aarch64_simd_xi __o; float64x2x4_t temp; temp.val[0] = vcombine_f64(val.val[0], vcreate_f64(__AARCH64_UINT64_C(0))); @@ -18978,7 +19095,7 @@ FUNK void vst4_f64(float64_t *__a, float64x1x4_t val) { __builtin_aarch64_st4df((__builtin_aarch64_simd_df *)__a, __o); } -FUNK void vst4_s8(int8_t *__a, int8x8x4_t val) { +__funline void vst4_s8(int8_t *__a, int8x8x4_t val) { __builtin_aarch64_simd_xi __o; int8x16x4_t temp; temp.val[0] = vcombine_s8(val.val[0], vcreate_s8(__AARCH64_INT64_C(0))); @@ -18992,7 +19109,7 @@ FUNK void vst4_s8(int8_t *__a, int8x8x4_t val) { __builtin_aarch64_st4v8qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst4_p8(poly8_t *__a, poly8x8x4_t val) { +__funline void vst4_p8(poly8_t *__a, poly8x8x4_t val) { __builtin_aarch64_simd_xi __o; poly8x16x4_t temp; temp.val[0] = vcombine_p8(val.val[0], vcreate_p8(__AARCH64_UINT64_C(0))); @@ -19006,7 +19123,7 @@ FUNK void vst4_p8(poly8_t *__a, poly8x8x4_t val) { __builtin_aarch64_st4v8qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst4_s16(int16_t *__a, int16x4x4_t val) { +__funline void vst4_s16(int16_t *__a, int16x4x4_t val) { __builtin_aarch64_simd_xi __o; int16x8x4_t temp; temp.val[0] = vcombine_s16(val.val[0], vcreate_s16(__AARCH64_INT64_C(0))); @@ -19020,7 +19137,7 @@ FUNK void vst4_s16(int16_t *__a, int16x4x4_t val) { __builtin_aarch64_st4v4hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst4_p16(poly16_t *__a, poly16x4x4_t val) { +__funline void vst4_p16(poly16_t *__a, poly16x4x4_t val) { __builtin_aarch64_simd_xi __o; poly16x8x4_t temp; temp.val[0] = vcombine_p16(val.val[0], vcreate_p16(__AARCH64_UINT64_C(0))); @@ -19034,7 +19151,7 @@ FUNK void vst4_p16(poly16_t *__a, poly16x4x4_t val) { __builtin_aarch64_st4v4hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst4_s32(int32_t *__a, int32x2x4_t val) { +__funline void vst4_s32(int32_t *__a, int32x2x4_t val) { __builtin_aarch64_simd_xi __o; int32x4x4_t temp; temp.val[0] = vcombine_s32(val.val[0], vcreate_s32(__AARCH64_INT64_C(0))); @@ -19048,7 +19165,7 @@ FUNK void vst4_s32(int32_t *__a, int32x2x4_t val) { __builtin_aarch64_st4v2si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst4_u8(uint8_t *__a, uint8x8x4_t val) { +__funline void vst4_u8(uint8_t *__a, uint8x8x4_t val) { __builtin_aarch64_simd_xi __o; uint8x16x4_t temp; temp.val[0] = vcombine_u8(val.val[0], vcreate_u8(__AARCH64_UINT64_C(0))); @@ -19062,7 +19179,7 @@ FUNK void vst4_u8(uint8_t *__a, uint8x8x4_t val) { __builtin_aarch64_st4v8qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst4_u16(uint16_t *__a, uint16x4x4_t val) { +__funline void vst4_u16(uint16_t *__a, uint16x4x4_t val) { __builtin_aarch64_simd_xi __o; uint16x8x4_t temp; temp.val[0] = vcombine_u16(val.val[0], vcreate_u16(__AARCH64_UINT64_C(0))); @@ -19076,7 +19193,7 @@ FUNK void vst4_u16(uint16_t *__a, uint16x4x4_t val) { __builtin_aarch64_st4v4hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst4_u32(uint32_t *__a, uint32x2x4_t val) { +__funline void vst4_u32(uint32_t *__a, uint32x2x4_t val) { __builtin_aarch64_simd_xi __o; uint32x4x4_t temp; temp.val[0] = vcombine_u32(val.val[0], vcreate_u32(__AARCH64_UINT64_C(0))); @@ -19090,7 +19207,7 @@ FUNK void vst4_u32(uint32_t *__a, uint32x2x4_t val) { __builtin_aarch64_st4v2si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst4_f16(float16_t *__a, float16x4x4_t val) { +__funline void vst4_f16(float16_t *__a, float16x4x4_t val) { __builtin_aarch64_simd_xi __o; 
float16x8x4_t temp; temp.val[0] = vcombine_f16(val.val[0], vcreate_f16(__AARCH64_UINT64_C(0))); @@ -19104,7 +19221,7 @@ FUNK void vst4_f16(float16_t *__a, float16x4x4_t val) { __builtin_aarch64_st4v4hf((__builtin_aarch64_simd_hf *)__a, __o); } -FUNK void vst4_f32(float32_t *__a, float32x2x4_t val) { +__funline void vst4_f32(float32_t *__a, float32x2x4_t val) { __builtin_aarch64_simd_xi __o; float32x4x4_t temp; temp.val[0] = vcombine_f32(val.val[0], vcreate_f32(__AARCH64_UINT64_C(0))); @@ -19118,7 +19235,7 @@ FUNK void vst4_f32(float32_t *__a, float32x2x4_t val) { __builtin_aarch64_st4v2sf((__builtin_aarch64_simd_sf *)__a, __o); } -FUNK void vst4_p64(poly64_t *__a, poly64x1x4_t val) { +__funline void vst4_p64(poly64_t *__a, poly64x1x4_t val) { __builtin_aarch64_simd_xi __o; poly64x2x4_t temp; temp.val[0] = vcombine_p64(val.val[0], vcreate_p64(__AARCH64_UINT64_C(0))); @@ -19132,7 +19249,7 @@ FUNK void vst4_p64(poly64_t *__a, poly64x1x4_t val) { __builtin_aarch64_st4di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst4q_s8(int8_t *__a, int8x16x4_t val) { +__funline void vst4q_s8(int8_t *__a, int8x16x4_t val) { __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)val.val[0], 0); __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)val.val[1], 1); @@ -19141,7 +19258,7 @@ FUNK void vst4q_s8(int8_t *__a, int8x16x4_t val) { __builtin_aarch64_st4v16qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst4q_p8(poly8_t *__a, poly8x16x4_t val) { +__funline void vst4q_p8(poly8_t *__a, poly8x16x4_t val) { __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)val.val[0], 0); __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)val.val[1], 1); @@ -19150,7 +19267,7 @@ FUNK void vst4q_p8(poly8_t *__a, poly8x16x4_t val) { __builtin_aarch64_st4v16qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst4q_s16(int16_t *__a, int16x8x4_t val) { +__funline void vst4q_s16(int16_t *__a, int16x8x4_t val) { __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_set_qregxiv8hi(__o, (int16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregxiv8hi(__o, (int16x8_t)val.val[1], 1); @@ -19159,7 +19276,7 @@ FUNK void vst4q_s16(int16_t *__a, int16x8x4_t val) { __builtin_aarch64_st4v8hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst4q_p16(poly16_t *__a, poly16x8x4_t val) { +__funline void vst4q_p16(poly16_t *__a, poly16x8x4_t val) { __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_set_qregxiv8hi(__o, (int16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregxiv8hi(__o, (int16x8_t)val.val[1], 1); @@ -19168,7 +19285,7 @@ FUNK void vst4q_p16(poly16_t *__a, poly16x8x4_t val) { __builtin_aarch64_st4v8hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst4q_s32(int32_t *__a, int32x4x4_t val) { +__funline void vst4q_s32(int32_t *__a, int32x4x4_t val) { __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_set_qregxiv4si(__o, (int32x4_t)val.val[0], 0); __o = __builtin_aarch64_set_qregxiv4si(__o, (int32x4_t)val.val[1], 1); @@ -19177,7 +19294,7 @@ FUNK void vst4q_s32(int32_t *__a, int32x4x4_t val) { __builtin_aarch64_st4v4si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst4q_s64(int64_t *__a, int64x2x4_t val) { +__funline void vst4q_s64(int64_t *__a, int64x2x4_t val) { __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_set_qregxiv2di(__o, (int64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregxiv2di(__o, (int64x2_t)val.val[1], 1); @@ -19186,7 +19303,7 @@ FUNK void vst4q_s64(int64_t *__a, int64x2x4_t val) { 
__builtin_aarch64_st4v2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst4q_u8(uint8_t *__a, uint8x16x4_t val) { +__funline void vst4q_u8(uint8_t *__a, uint8x16x4_t val) { __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)val.val[0], 0); __o = __builtin_aarch64_set_qregxiv16qi(__o, (int8x16_t)val.val[1], 1); @@ -19195,7 +19312,7 @@ FUNK void vst4q_u8(uint8_t *__a, uint8x16x4_t val) { __builtin_aarch64_st4v16qi((__builtin_aarch64_simd_qi *)__a, __o); } -FUNK void vst4q_u16(uint16_t *__a, uint16x8x4_t val) { +__funline void vst4q_u16(uint16_t *__a, uint16x8x4_t val) { __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_set_qregxiv8hi(__o, (int16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregxiv8hi(__o, (int16x8_t)val.val[1], 1); @@ -19204,7 +19321,7 @@ FUNK void vst4q_u16(uint16_t *__a, uint16x8x4_t val) { __builtin_aarch64_st4v8hi((__builtin_aarch64_simd_hi *)__a, __o); } -FUNK void vst4q_u32(uint32_t *__a, uint32x4x4_t val) { +__funline void vst4q_u32(uint32_t *__a, uint32x4x4_t val) { __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_set_qregxiv4si(__o, (int32x4_t)val.val[0], 0); __o = __builtin_aarch64_set_qregxiv4si(__o, (int32x4_t)val.val[1], 1); @@ -19213,7 +19330,7 @@ FUNK void vst4q_u32(uint32_t *__a, uint32x4x4_t val) { __builtin_aarch64_st4v4si((__builtin_aarch64_simd_si *)__a, __o); } -FUNK void vst4q_u64(uint64_t *__a, uint64x2x4_t val) { +__funline void vst4q_u64(uint64_t *__a, uint64x2x4_t val) { __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_set_qregxiv2di(__o, (int64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregxiv2di(__o, (int64x2_t)val.val[1], 1); @@ -19222,7 +19339,7 @@ FUNK void vst4q_u64(uint64_t *__a, uint64x2x4_t val) { __builtin_aarch64_st4v2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK void vst4q_f16(float16_t *__a, float16x8x4_t val) { +__funline void vst4q_f16(float16_t *__a, float16x8x4_t val) { __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_set_qregxiv8hf(__o, (float16x8_t)val.val[0], 0); __o = __builtin_aarch64_set_qregxiv8hf(__o, (float16x8_t)val.val[1], 1); @@ -19231,7 +19348,7 @@ FUNK void vst4q_f16(float16_t *__a, float16x8x4_t val) { __builtin_aarch64_st4v8hf((__builtin_aarch64_simd_hf *)__a, __o); } -FUNK void vst4q_f32(float32_t *__a, float32x4x4_t val) { +__funline void vst4q_f32(float32_t *__a, float32x4x4_t val) { __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_set_qregxiv4sf(__o, (float32x4_t)val.val[0], 0); __o = __builtin_aarch64_set_qregxiv4sf(__o, (float32x4_t)val.val[1], 1); @@ -19240,7 +19357,7 @@ FUNK void vst4q_f32(float32_t *__a, float32x4x4_t val) { __builtin_aarch64_st4v4sf((__builtin_aarch64_simd_sf *)__a, __o); } -FUNK void vst4q_f64(float64_t *__a, float64x2x4_t val) { +__funline void vst4q_f64(float64_t *__a, float64x2x4_t val) { __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_set_qregxiv2df(__o, (float64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregxiv2df(__o, (float64x2_t)val.val[1], 1); @@ -19249,7 +19366,7 @@ FUNK void vst4q_f64(float64_t *__a, float64x2x4_t val) { __builtin_aarch64_st4v2df((__builtin_aarch64_simd_df *)__a, __o); } -FUNK void vst4q_p64(poly64_t *__a, poly64x2x4_t val) { +__funline void vst4q_p64(poly64_t *__a, poly64x2x4_t val) { __builtin_aarch64_simd_xi __o; __o = __builtin_aarch64_set_qregxiv2di_ssps(__o, (poly64x2_t)val.val[0], 0); __o = __builtin_aarch64_set_qregxiv2di_ssps(__o, (poly64x2_t)val.val[1], 1); @@ -19258,57 +19375,59 @@ FUNK void vst4q_p64(poly64_t *__a, poly64x2x4_t val) { 
__builtin_aarch64_st4v2di((__builtin_aarch64_simd_di *)__a, __o); } -FUNK int64_t vsubd_s64(int64_t __a, int64_t __b) { +__funline int64_t vsubd_s64(int64_t __a, int64_t __b) { return __a - __b; } -FUNK uint64_t vsubd_u64(uint64_t __a, uint64_t __b) { +__funline uint64_t vsubd_u64(uint64_t __a, uint64_t __b) { return __a - __b; } -FUNK int8x8_t vtbx1_s8(int8x8_t __r, int8x8_t __tab, int8x8_t __idx) { +__funline int8x8_t vtbx1_s8(int8x8_t __r, int8x8_t __tab, int8x8_t __idx) { uint8x8_t __mask = vclt_u8(vreinterpret_u8_s8(__idx), vmov_n_u8(8)); int8x8_t __tbl = vtbl1_s8(__tab, __idx); return vbsl_s8(__mask, __tbl, __r); } -FUNK uint8x8_t vtbx1_u8(uint8x8_t __r, uint8x8_t __tab, uint8x8_t __idx) { +__funline uint8x8_t vtbx1_u8(uint8x8_t __r, uint8x8_t __tab, uint8x8_t __idx) { uint8x8_t __mask = vclt_u8(__idx, vmov_n_u8(8)); uint8x8_t __tbl = vtbl1_u8(__tab, __idx); return vbsl_u8(__mask, __tbl, __r); } -FUNK poly8x8_t vtbx1_p8(poly8x8_t __r, poly8x8_t __tab, uint8x8_t __idx) { +__funline poly8x8_t vtbx1_p8(poly8x8_t __r, poly8x8_t __tab, uint8x8_t __idx) { uint8x8_t __mask = vclt_u8(__idx, vmov_n_u8(8)); poly8x8_t __tbl = vtbl1_p8(__tab, __idx); return vbsl_p8(__mask, __tbl, __r); } -FUNK int8x8_t vtbx3_s8(int8x8_t __r, int8x8x3_t __tab, int8x8_t __idx) { +__funline int8x8_t vtbx3_s8(int8x8_t __r, int8x8x3_t __tab, int8x8_t __idx) { uint8x8_t __mask = vclt_u8(vreinterpret_u8_s8(__idx), vmov_n_u8(24)); int8x8_t __tbl = vtbl3_s8(__tab, __idx); return vbsl_s8(__mask, __tbl, __r); } -FUNK uint8x8_t vtbx3_u8(uint8x8_t __r, uint8x8x3_t __tab, uint8x8_t __idx) { +__funline uint8x8_t vtbx3_u8(uint8x8_t __r, uint8x8x3_t __tab, + uint8x8_t __idx) { uint8x8_t __mask = vclt_u8(__idx, vmov_n_u8(24)); uint8x8_t __tbl = vtbl3_u8(__tab, __idx); return vbsl_u8(__mask, __tbl, __r); } -FUNK poly8x8_t vtbx3_p8(poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx) { +__funline poly8x8_t vtbx3_p8(poly8x8_t __r, poly8x8x3_t __tab, + uint8x8_t __idx) { uint8x8_t __mask = vclt_u8(__idx, vmov_n_u8(24)); poly8x8_t __tbl = vtbl3_p8(__tab, __idx); return vbsl_p8(__mask, __tbl, __r); } -FUNK int8x8_t vtbx4_s8(int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx) { +__funline int8x8_t vtbx4_s8(int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx) { int8x8_t result; int8x16x2_t temp; __builtin_aarch64_simd_oi __o; @@ -19320,7 +19439,8 @@ FUNK int8x8_t vtbx4_s8(int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx) { return result; } -FUNK uint8x8_t vtbx4_u8(uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx) { +__funline uint8x8_t vtbx4_u8(uint8x8_t __r, uint8x8x4_t __tab, + uint8x8_t __idx) { uint8x8_t result; uint8x16x2_t temp; __builtin_aarch64_simd_oi __o; @@ -19333,7 +19453,8 @@ FUNK uint8x8_t vtbx4_u8(uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx) { return result; } -FUNK poly8x8_t vtbx4_p8(poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx) { +__funline poly8x8_t vtbx4_p8(poly8x8_t __r, poly8x8x4_t __tab, + uint8x8_t __idx) { poly8x8_t result; poly8x16x2_t temp; __builtin_aarch64_simd_oi __o; @@ -19346,7 +19467,7 @@ FUNK poly8x8_t vtbx4_p8(poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx) { return result; } -FUNK float16x4_t vtrn1_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vtrn1_f16(float16x4_t __a, float16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){5, 1, 7, 3}); #else @@ -19354,7 +19475,7 @@ FUNK float16x4_t vtrn1_f16(float16x4_t __a, float16x4_t __b) { #endif } -FUNK float32x2_t vtrn1_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vtrn1_f32(float32x2_t __a, 
float32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){3, 1}); #else @@ -19362,7 +19483,7 @@ FUNK float32x2_t vtrn1_f32(float32x2_t __a, float32x2_t __b) { #endif } -FUNK poly8x8_t vtrn1_p8(poly8x8_t __a, poly8x8_t __b) { +__funline poly8x8_t vtrn1_p8(poly8x8_t __a, poly8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){9, 1, 11, 3, 13, 5, 15, 7}); #else @@ -19370,7 +19491,7 @@ FUNK poly8x8_t vtrn1_p8(poly8x8_t __a, poly8x8_t __b) { #endif } -FUNK poly16x4_t vtrn1_p16(poly16x4_t __a, poly16x4_t __b) { +__funline poly16x4_t vtrn1_p16(poly16x4_t __a, poly16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){5, 1, 7, 3}); #else @@ -19378,7 +19499,7 @@ FUNK poly16x4_t vtrn1_p16(poly16x4_t __a, poly16x4_t __b) { #endif } -FUNK int8x8_t vtrn1_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vtrn1_s8(int8x8_t __a, int8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){9, 1, 11, 3, 13, 5, 15, 7}); #else @@ -19386,7 +19507,7 @@ FUNK int8x8_t vtrn1_s8(int8x8_t __a, int8x8_t __b) { #endif } -FUNK int16x4_t vtrn1_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vtrn1_s16(int16x4_t __a, int16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){5, 1, 7, 3}); #else @@ -19394,7 +19515,7 @@ FUNK int16x4_t vtrn1_s16(int16x4_t __a, int16x4_t __b) { #endif } -FUNK int32x2_t vtrn1_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vtrn1_s32(int32x2_t __a, int32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){3, 1}); #else @@ -19402,7 +19523,7 @@ FUNK int32x2_t vtrn1_s32(int32x2_t __a, int32x2_t __b) { #endif } -FUNK uint8x8_t vtrn1_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vtrn1_u8(uint8x8_t __a, uint8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){9, 1, 11, 3, 13, 5, 15, 7}); #else @@ -19410,7 +19531,7 @@ FUNK uint8x8_t vtrn1_u8(uint8x8_t __a, uint8x8_t __b) { #endif } -FUNK uint16x4_t vtrn1_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vtrn1_u16(uint16x4_t __a, uint16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){5, 1, 7, 3}); #else @@ -19418,7 +19539,7 @@ FUNK uint16x4_t vtrn1_u16(uint16x4_t __a, uint16x4_t __b) { #endif } -FUNK uint32x2_t vtrn1_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vtrn1_u32(uint32x2_t __a, uint32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){3, 1}); #else @@ -19426,7 +19547,7 @@ FUNK uint32x2_t vtrn1_u32(uint32x2_t __a, uint32x2_t __b) { #endif } -FUNK float16x8_t vtrn1q_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vtrn1q_f16(float16x8_t __a, float16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){9, 1, 11, 3, 13, 5, 15, 7}); #else @@ -19434,7 +19555,7 @@ FUNK float16x8_t vtrn1q_f16(float16x8_t __a, float16x8_t __b) { #endif } -FUNK float32x4_t vtrn1q_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vtrn1q_f32(float32x4_t __a, float32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){5, 1, 7, 3}); #else @@ -19442,7 +19563,7 @@ FUNK float32x4_t vtrn1q_f32(float32x4_t __a, float32x4_t __b) { #endif } -FUNK float64x2_t vtrn1q_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vtrn1q_f64(float64x2_t __a, float64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){3, 1}); #else @@ -19450,7 +19571,7 @@ FUNK float64x2_t 
vtrn1q_f64(float64x2_t __a, float64x2_t __b) { #endif } -FUNK poly8x16_t vtrn1q_p8(poly8x16_t __a, poly8x16_t __b) { +__funline poly8x16_t vtrn1q_p8(poly8x16_t __a, poly8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle( __a, __b, @@ -19462,7 +19583,7 @@ FUNK poly8x16_t vtrn1q_p8(poly8x16_t __a, poly8x16_t __b) { #endif } -FUNK poly16x8_t vtrn1q_p16(poly16x8_t __a, poly16x8_t __b) { +__funline poly16x8_t vtrn1q_p16(poly16x8_t __a, poly16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){9, 1, 11, 3, 13, 5, 15, 7}); #else @@ -19470,7 +19591,7 @@ FUNK poly16x8_t vtrn1q_p16(poly16x8_t __a, poly16x8_t __b) { #endif } -FUNK int8x16_t vtrn1q_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vtrn1q_s8(int8x16_t __a, int8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle( __a, __b, @@ -19482,7 +19603,7 @@ FUNK int8x16_t vtrn1q_s8(int8x16_t __a, int8x16_t __b) { #endif } -FUNK int16x8_t vtrn1q_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vtrn1q_s16(int16x8_t __a, int16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){9, 1, 11, 3, 13, 5, 15, 7}); #else @@ -19490,7 +19611,7 @@ FUNK int16x8_t vtrn1q_s16(int16x8_t __a, int16x8_t __b) { #endif } -FUNK int32x4_t vtrn1q_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vtrn1q_s32(int32x4_t __a, int32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){5, 1, 7, 3}); #else @@ -19498,7 +19619,7 @@ FUNK int32x4_t vtrn1q_s32(int32x4_t __a, int32x4_t __b) { #endif } -FUNK int64x2_t vtrn1q_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t vtrn1q_s64(int64x2_t __a, int64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){3, 1}); #else @@ -19506,7 +19627,7 @@ FUNK int64x2_t vtrn1q_s64(int64x2_t __a, int64x2_t __b) { #endif } -FUNK uint8x16_t vtrn1q_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vtrn1q_u8(uint8x16_t __a, uint8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle( __a, __b, @@ -19518,7 +19639,7 @@ FUNK uint8x16_t vtrn1q_u8(uint8x16_t __a, uint8x16_t __b) { #endif } -FUNK uint16x8_t vtrn1q_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vtrn1q_u16(uint16x8_t __a, uint16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){9, 1, 11, 3, 13, 5, 15, 7}); #else @@ -19526,7 +19647,7 @@ FUNK uint16x8_t vtrn1q_u16(uint16x8_t __a, uint16x8_t __b) { #endif } -FUNK uint32x4_t vtrn1q_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vtrn1q_u32(uint32x4_t __a, uint32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){5, 1, 7, 3}); #else @@ -19534,7 +19655,7 @@ FUNK uint32x4_t vtrn1q_u32(uint32x4_t __a, uint32x4_t __b) { #endif } -FUNK uint64x2_t vtrn1q_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vtrn1q_u64(uint64x2_t __a, uint64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){3, 1}); #else @@ -19542,7 +19663,7 @@ FUNK uint64x2_t vtrn1q_u64(uint64x2_t __a, uint64x2_t __b) { #endif } -FUNK float16x4_t vtrn2_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vtrn2_f16(float16x4_t __a, float16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){4, 0, 6, 2}); #else @@ -19550,7 +19671,7 @@ FUNK float16x4_t vtrn2_f16(float16x4_t __a, float16x4_t __b) { #endif } -FUNK float32x2_t vtrn2_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vtrn2_f32(float32x2_t __a, float32x2_t __b) { #ifdef __AARCH64EB__ return 
__builtin_shuffle(__a, __b, (uint32x2_t){2, 0}); #else @@ -19558,7 +19679,7 @@ FUNK float32x2_t vtrn2_f32(float32x2_t __a, float32x2_t __b) { #endif } -FUNK poly8x8_t vtrn2_p8(poly8x8_t __a, poly8x8_t __b) { +__funline poly8x8_t vtrn2_p8(poly8x8_t __a, poly8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){8, 0, 10, 2, 12, 4, 14, 6}); #else @@ -19566,7 +19687,7 @@ FUNK poly8x8_t vtrn2_p8(poly8x8_t __a, poly8x8_t __b) { #endif } -FUNK poly16x4_t vtrn2_p16(poly16x4_t __a, poly16x4_t __b) { +__funline poly16x4_t vtrn2_p16(poly16x4_t __a, poly16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){4, 0, 6, 2}); #else @@ -19574,7 +19695,7 @@ FUNK poly16x4_t vtrn2_p16(poly16x4_t __a, poly16x4_t __b) { #endif } -FUNK int8x8_t vtrn2_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vtrn2_s8(int8x8_t __a, int8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){8, 0, 10, 2, 12, 4, 14, 6}); #else @@ -19582,7 +19703,7 @@ FUNK int8x8_t vtrn2_s8(int8x8_t __a, int8x8_t __b) { #endif } -FUNK int16x4_t vtrn2_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vtrn2_s16(int16x4_t __a, int16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){4, 0, 6, 2}); #else @@ -19590,7 +19711,7 @@ FUNK int16x4_t vtrn2_s16(int16x4_t __a, int16x4_t __b) { #endif } -FUNK int32x2_t vtrn2_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vtrn2_s32(int32x2_t __a, int32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){2, 0}); #else @@ -19598,7 +19719,7 @@ FUNK int32x2_t vtrn2_s32(int32x2_t __a, int32x2_t __b) { #endif } -FUNK uint8x8_t vtrn2_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vtrn2_u8(uint8x8_t __a, uint8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){8, 0, 10, 2, 12, 4, 14, 6}); #else @@ -19606,7 +19727,7 @@ FUNK uint8x8_t vtrn2_u8(uint8x8_t __a, uint8x8_t __b) { #endif } -FUNK uint16x4_t vtrn2_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vtrn2_u16(uint16x4_t __a, uint16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){4, 0, 6, 2}); #else @@ -19614,7 +19735,7 @@ FUNK uint16x4_t vtrn2_u16(uint16x4_t __a, uint16x4_t __b) { #endif } -FUNK uint32x2_t vtrn2_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vtrn2_u32(uint32x2_t __a, uint32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){2, 0}); #else @@ -19622,7 +19743,7 @@ FUNK uint32x2_t vtrn2_u32(uint32x2_t __a, uint32x2_t __b) { #endif } -FUNK float16x8_t vtrn2q_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vtrn2q_f16(float16x8_t __a, float16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){8, 0, 10, 2, 12, 4, 14, 6}); #else @@ -19630,7 +19751,7 @@ FUNK float16x8_t vtrn2q_f16(float16x8_t __a, float16x8_t __b) { #endif } -FUNK float32x4_t vtrn2q_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vtrn2q_f32(float32x4_t __a, float32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){4, 0, 6, 2}); #else @@ -19638,7 +19759,7 @@ FUNK float32x4_t vtrn2q_f32(float32x4_t __a, float32x4_t __b) { #endif } -FUNK float64x2_t vtrn2q_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vtrn2q_f64(float64x2_t __a, float64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){2, 0}); #else @@ -19646,7 +19767,7 @@ FUNK float64x2_t vtrn2q_f64(float64x2_t __a, float64x2_t __b) { 
#endif } -FUNK poly8x16_t vtrn2q_p8(poly8x16_t __a, poly8x16_t __b) { +__funline poly8x16_t vtrn2q_p8(poly8x16_t __a, poly8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle( __a, __b, @@ -19658,7 +19779,7 @@ FUNK poly8x16_t vtrn2q_p8(poly8x16_t __a, poly8x16_t __b) { #endif } -FUNK poly16x8_t vtrn2q_p16(poly16x8_t __a, poly16x8_t __b) { +__funline poly16x8_t vtrn2q_p16(poly16x8_t __a, poly16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){8, 0, 10, 2, 12, 4, 14, 6}); #else @@ -19666,7 +19787,7 @@ FUNK poly16x8_t vtrn2q_p16(poly16x8_t __a, poly16x8_t __b) { #endif } -FUNK int8x16_t vtrn2q_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vtrn2q_s8(int8x16_t __a, int8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle( __a, __b, @@ -19678,7 +19799,7 @@ FUNK int8x16_t vtrn2q_s8(int8x16_t __a, int8x16_t __b) { #endif } -FUNK int16x8_t vtrn2q_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vtrn2q_s16(int16x8_t __a, int16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){8, 0, 10, 2, 12, 4, 14, 6}); #else @@ -19686,7 +19807,7 @@ FUNK int16x8_t vtrn2q_s16(int16x8_t __a, int16x8_t __b) { #endif } -FUNK int32x4_t vtrn2q_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vtrn2q_s32(int32x4_t __a, int32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){4, 0, 6, 2}); #else @@ -19694,7 +19815,7 @@ FUNK int32x4_t vtrn2q_s32(int32x4_t __a, int32x4_t __b) { #endif } -FUNK int64x2_t vtrn2q_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t vtrn2q_s64(int64x2_t __a, int64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){2, 0}); #else @@ -19702,7 +19823,7 @@ FUNK int64x2_t vtrn2q_s64(int64x2_t __a, int64x2_t __b) { #endif } -FUNK uint8x16_t vtrn2q_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vtrn2q_u8(uint8x16_t __a, uint8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle( __a, __b, @@ -19714,7 +19835,7 @@ FUNK uint8x16_t vtrn2q_u8(uint8x16_t __a, uint8x16_t __b) { #endif } -FUNK uint16x8_t vtrn2q_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vtrn2q_u16(uint16x8_t __a, uint16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){8, 0, 10, 2, 12, 4, 14, 6}); #else @@ -19722,7 +19843,7 @@ FUNK uint16x8_t vtrn2q_u16(uint16x8_t __a, uint16x8_t __b) { #endif } -FUNK uint32x4_t vtrn2q_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vtrn2q_u32(uint32x4_t __a, uint32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){4, 0, 6, 2}); #else @@ -19730,7 +19851,7 @@ FUNK uint32x4_t vtrn2q_u32(uint32x4_t __a, uint32x4_t __b) { #endif } -FUNK uint64x2_t vtrn2q_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vtrn2q_u64(uint64x2_t __a, uint64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){2, 0}); #else @@ -19738,210 +19859,210 @@ FUNK uint64x2_t vtrn2q_u64(uint64x2_t __a, uint64x2_t __b) { #endif } -FUNK float16x4x2_t vtrn_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4x2_t vtrn_f16(float16x4_t __a, float16x4_t __b) { return (float16x4x2_t){vtrn1_f16(__a, __b), vtrn2_f16(__a, __b)}; } -FUNK float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b) { +__funline float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b) { return (float32x2x2_t){vtrn1_f32(a, b), vtrn2_f32(a, b)}; } -FUNK poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b) { +__funline poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b) { return 
(poly8x8x2_t){vtrn1_p8(a, b), vtrn2_p8(a, b)}; } -FUNK poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b) { +__funline poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b) { return (poly16x4x2_t){vtrn1_p16(a, b), vtrn2_p16(a, b)}; } -FUNK int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) { +__funline int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) { return (int8x8x2_t){vtrn1_s8(a, b), vtrn2_s8(a, b)}; } -FUNK int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) { +__funline int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) { return (int16x4x2_t){vtrn1_s16(a, b), vtrn2_s16(a, b)}; } -FUNK int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b) { +__funline int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b) { return (int32x2x2_t){vtrn1_s32(a, b), vtrn2_s32(a, b)}; } -FUNK uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b) { +__funline uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8x2_t){vtrn1_u8(a, b), vtrn2_u8(a, b)}; } -FUNK uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b) { +__funline uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4x2_t){vtrn1_u16(a, b), vtrn2_u16(a, b)}; } -FUNK uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b) { +__funline uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2x2_t){vtrn1_u32(a, b), vtrn2_u32(a, b)}; } -FUNK float16x8x2_t vtrnq_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8x2_t vtrnq_f16(float16x8_t __a, float16x8_t __b) { return (float16x8x2_t){vtrn1q_f16(__a, __b), vtrn2q_f16(__a, __b)}; } -FUNK float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) { +__funline float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) { return (float32x4x2_t){vtrn1q_f32(a, b), vtrn2q_f32(a, b)}; } -FUNK poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b) { +__funline poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b) { return (poly8x16x2_t){vtrn1q_p8(a, b), vtrn2q_p8(a, b)}; } -FUNK poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b) { +__funline poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b) { return (poly16x8x2_t){vtrn1q_p16(a, b), vtrn2q_p16(a, b)}; } -FUNK int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) { +__funline int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) { return (int8x16x2_t){vtrn1q_s8(a, b), vtrn2q_s8(a, b)}; } -FUNK int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) { +__funline int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) { return (int16x8x2_t){vtrn1q_s16(a, b), vtrn2q_s16(a, b)}; } -FUNK int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) { +__funline int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) { return (int32x4x2_t){vtrn1q_s32(a, b), vtrn2q_s32(a, b)}; } -FUNK uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b) { +__funline uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16x2_t){vtrn1q_u8(a, b), vtrn2q_u8(a, b)}; } -FUNK uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b) { +__funline uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8x2_t){vtrn1q_u16(a, b), vtrn2q_u16(a, b)}; } -FUNK uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b) { +__funline uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4x2_t){vtrn1q_u32(a, b), vtrn2q_u32(a, b)}; } -FUNK uint8x8_t vtst_s8(int8x8_t __a, int8x8_t __b) { +__funline uint8x8_t vtst_s8(int8x8_t __a, int8x8_t __b) { return (uint8x8_t)((__a & __b) != 0); } -FUNK uint16x4_t vtst_s16(int16x4_t __a, int16x4_t __b) { +__funline uint16x4_t vtst_s16(int16x4_t __a, int16x4_t __b) { return (uint16x4_t)((__a & __b) != 0); } -FUNK uint32x2_t vtst_s32(int32x2_t __a, int32x2_t __b) { +__funline uint32x2_t vtst_s32(int32x2_t __a, int32x2_t __b) { return 
(uint32x2_t)((__a & __b) != 0); } -FUNK uint64x1_t vtst_s64(int64x1_t __a, int64x1_t __b) { +__funline uint64x1_t vtst_s64(int64x1_t __a, int64x1_t __b) { return (uint64x1_t)((__a & __b) != __AARCH64_INT64_C(0)); } -FUNK uint8x8_t vtst_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vtst_u8(uint8x8_t __a, uint8x8_t __b) { return ((__a & __b) != 0); } -FUNK uint16x4_t vtst_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vtst_u16(uint16x4_t __a, uint16x4_t __b) { return ((__a & __b) != 0); } -FUNK uint32x2_t vtst_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vtst_u32(uint32x2_t __a, uint32x2_t __b) { return ((__a & __b) != 0); } -FUNK uint64x1_t vtst_u64(uint64x1_t __a, uint64x1_t __b) { +__funline uint64x1_t vtst_u64(uint64x1_t __a, uint64x1_t __b) { return ((__a & __b) != __AARCH64_UINT64_C(0)); } -FUNK uint8x16_t vtstq_s8(int8x16_t __a, int8x16_t __b) { +__funline uint8x16_t vtstq_s8(int8x16_t __a, int8x16_t __b) { return (uint8x16_t)((__a & __b) != 0); } -FUNK uint16x8_t vtstq_s16(int16x8_t __a, int16x8_t __b) { +__funline uint16x8_t vtstq_s16(int16x8_t __a, int16x8_t __b) { return (uint16x8_t)((__a & __b) != 0); } -FUNK uint32x4_t vtstq_s32(int32x4_t __a, int32x4_t __b) { +__funline uint32x4_t vtstq_s32(int32x4_t __a, int32x4_t __b) { return (uint32x4_t)((__a & __b) != 0); } -FUNK uint64x2_t vtstq_s64(int64x2_t __a, int64x2_t __b) { +__funline uint64x2_t vtstq_s64(int64x2_t __a, int64x2_t __b) { return (uint64x2_t)((__a & __b) != __AARCH64_INT64_C(0)); } -FUNK uint8x16_t vtstq_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vtstq_u8(uint8x16_t __a, uint8x16_t __b) { return ((__a & __b) != 0); } -FUNK uint16x8_t vtstq_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vtstq_u16(uint16x8_t __a, uint16x8_t __b) { return ((__a & __b) != 0); } -FUNK uint32x4_t vtstq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vtstq_u32(uint32x4_t __a, uint32x4_t __b) { return ((__a & __b) != 0); } -FUNK uint64x2_t vtstq_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vtstq_u64(uint64x2_t __a, uint64x2_t __b) { return ((__a & __b) != __AARCH64_UINT64_C(0)); } -FUNK uint64_t vtstd_s64(int64_t __a, int64_t __b) { +__funline uint64_t vtstd_s64(int64_t __a, int64_t __b) { return (__a & __b) ? -1ll : 0ll; } -FUNK uint64_t vtstd_u64(uint64_t __a, uint64_t __b) { +__funline uint64_t vtstd_u64(uint64_t __a, uint64_t __b) { return (__a & __b) ? 
-1ll : 0ll; } -FUNK int8x8_t vuqadd_s8(int8x8_t __a, uint8x8_t __b) { +__funline int8x8_t vuqadd_s8(int8x8_t __a, uint8x8_t __b) { return __builtin_aarch64_suqaddv8qi_ssu(__a, __b); } -FUNK int16x4_t vuqadd_s16(int16x4_t __a, uint16x4_t __b) { +__funline int16x4_t vuqadd_s16(int16x4_t __a, uint16x4_t __b) { return __builtin_aarch64_suqaddv4hi_ssu(__a, __b); } -FUNK int32x2_t vuqadd_s32(int32x2_t __a, uint32x2_t __b) { +__funline int32x2_t vuqadd_s32(int32x2_t __a, uint32x2_t __b) { return __builtin_aarch64_suqaddv2si_ssu(__a, __b); } -FUNK int64x1_t vuqadd_s64(int64x1_t __a, uint64x1_t __b) { +__funline int64x1_t vuqadd_s64(int64x1_t __a, uint64x1_t __b) { return (int64x1_t){__builtin_aarch64_suqadddi_ssu(__a[0], __b[0])}; } -FUNK int8x16_t vuqaddq_s8(int8x16_t __a, uint8x16_t __b) { +__funline int8x16_t vuqaddq_s8(int8x16_t __a, uint8x16_t __b) { return __builtin_aarch64_suqaddv16qi_ssu(__a, __b); } -FUNK int16x8_t vuqaddq_s16(int16x8_t __a, uint16x8_t __b) { +__funline int16x8_t vuqaddq_s16(int16x8_t __a, uint16x8_t __b) { return __builtin_aarch64_suqaddv8hi_ssu(__a, __b); } -FUNK int32x4_t vuqaddq_s32(int32x4_t __a, uint32x4_t __b) { +__funline int32x4_t vuqaddq_s32(int32x4_t __a, uint32x4_t __b) { return __builtin_aarch64_suqaddv4si_ssu(__a, __b); } -FUNK int64x2_t vuqaddq_s64(int64x2_t __a, uint64x2_t __b) { +__funline int64x2_t vuqaddq_s64(int64x2_t __a, uint64x2_t __b) { return __builtin_aarch64_suqaddv2di_ssu(__a, __b); } -FUNK int8_t vuqaddb_s8(int8_t __a, uint8_t __b) { +__funline int8_t vuqaddb_s8(int8_t __a, uint8_t __b) { return __builtin_aarch64_suqaddqi_ssu(__a, __b); } -FUNK int16_t vuqaddh_s16(int16_t __a, uint16_t __b) { +__funline int16_t vuqaddh_s16(int16_t __a, uint16_t __b) { return __builtin_aarch64_suqaddhi_ssu(__a, __b); } -FUNK int32_t vuqadds_s32(int32_t __a, uint32_t __b) { +__funline int32_t vuqadds_s32(int32_t __a, uint32_t __b) { return __builtin_aarch64_suqaddsi_ssu(__a, __b); } -FUNK int64_t vuqaddd_s64(int64_t __a, uint64_t __b) { +__funline int64_t vuqaddd_s64(int64_t __a, uint64_t __b) { return __builtin_aarch64_suqadddi_ssu(__a, __b); } -#define __DEFINTERLEAVE(op, rettype, intype, funcsuffix, Q) \ - FUNK rettype v##op##Q##_##funcsuffix(intype a, intype b) { \ - return (rettype){v##op##1##Q##_##funcsuffix(a, b), \ - v##op##2##Q##_##funcsuffix(a, b)}; \ +#define __DEFINTERLEAVE(op, rettype, intype, funcsuffix, Q) \ + __funline rettype v##op##Q##_##funcsuffix(intype a, intype b) { \ + return (rettype){v##op##1##Q##_##funcsuffix(a, b), \ + v##op##2##Q##_##funcsuffix(a, b)}; \ } #define __INTERLEAVE_LIST(op) \ @@ -19966,7 +20087,7 @@ FUNK int64_t vuqaddd_s64(int64_t __a, uint64_t __b) { __DEFINTERLEAVE(op, uint16x8x2_t, uint16x8_t, u16, q) \ __DEFINTERLEAVE(op, uint32x4x2_t, uint32x4_t, u32, q) -FUNK float16x4_t vuzp1_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vuzp1_f16(float16x4_t __a, float16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){5, 7, 1, 3}); #else @@ -19974,7 +20095,7 @@ FUNK float16x4_t vuzp1_f16(float16x4_t __a, float16x4_t __b) { #endif } -FUNK float32x2_t vuzp1_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vuzp1_f32(float32x2_t __a, float32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){3, 1}); #else @@ -19982,7 +20103,7 @@ FUNK float32x2_t vuzp1_f32(float32x2_t __a, float32x2_t __b) { #endif } -FUNK poly8x8_t vuzp1_p8(poly8x8_t __a, poly8x8_t __b) { +__funline poly8x8_t vuzp1_p8(poly8x8_t __a, poly8x8_t __b) { #ifdef __AARCH64EB__ return 
__builtin_shuffle(__a, __b, (uint8x8_t){9, 11, 13, 15, 1, 3, 5, 7}); #else @@ -19990,7 +20111,7 @@ FUNK poly8x8_t vuzp1_p8(poly8x8_t __a, poly8x8_t __b) { #endif } -FUNK poly16x4_t vuzp1_p16(poly16x4_t __a, poly16x4_t __b) { +__funline poly16x4_t vuzp1_p16(poly16x4_t __a, poly16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){5, 7, 1, 3}); #else @@ -19998,7 +20119,7 @@ FUNK poly16x4_t vuzp1_p16(poly16x4_t __a, poly16x4_t __b) { #endif } -FUNK int8x8_t vuzp1_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vuzp1_s8(int8x8_t __a, int8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){9, 11, 13, 15, 1, 3, 5, 7}); #else @@ -20006,7 +20127,7 @@ FUNK int8x8_t vuzp1_s8(int8x8_t __a, int8x8_t __b) { #endif } -FUNK int16x4_t vuzp1_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vuzp1_s16(int16x4_t __a, int16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){5, 7, 1, 3}); #else @@ -20014,7 +20135,7 @@ FUNK int16x4_t vuzp1_s16(int16x4_t __a, int16x4_t __b) { #endif } -FUNK int32x2_t vuzp1_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vuzp1_s32(int32x2_t __a, int32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){3, 1}); #else @@ -20022,7 +20143,7 @@ FUNK int32x2_t vuzp1_s32(int32x2_t __a, int32x2_t __b) { #endif } -FUNK uint8x8_t vuzp1_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vuzp1_u8(uint8x8_t __a, uint8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){9, 11, 13, 15, 1, 3, 5, 7}); #else @@ -20030,7 +20151,7 @@ FUNK uint8x8_t vuzp1_u8(uint8x8_t __a, uint8x8_t __b) { #endif } -FUNK uint16x4_t vuzp1_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vuzp1_u16(uint16x4_t __a, uint16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){5, 7, 1, 3}); #else @@ -20038,7 +20159,7 @@ FUNK uint16x4_t vuzp1_u16(uint16x4_t __a, uint16x4_t __b) { #endif } -FUNK uint32x2_t vuzp1_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vuzp1_u32(uint32x2_t __a, uint32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){3, 1}); #else @@ -20046,7 +20167,7 @@ FUNK uint32x2_t vuzp1_u32(uint32x2_t __a, uint32x2_t __b) { #endif } -FUNK float16x8_t vuzp1q_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vuzp1q_f16(float16x8_t __a, float16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){9, 11, 13, 15, 1, 3, 5, 7}); #else @@ -20054,7 +20175,7 @@ FUNK float16x8_t vuzp1q_f16(float16x8_t __a, float16x8_t __b) { #endif } -FUNK float32x4_t vuzp1q_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vuzp1q_f32(float32x4_t __a, float32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){5, 7, 1, 3}); #else @@ -20062,7 +20183,7 @@ FUNK float32x4_t vuzp1q_f32(float32x4_t __a, float32x4_t __b) { #endif } -FUNK float64x2_t vuzp1q_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vuzp1q_f64(float64x2_t __a, float64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){3, 1}); #else @@ -20070,7 +20191,7 @@ FUNK float64x2_t vuzp1q_f64(float64x2_t __a, float64x2_t __b) { #endif } -FUNK poly8x16_t vuzp1q_p8(poly8x16_t __a, poly8x16_t __b) { +__funline poly8x16_t vuzp1q_p8(poly8x16_t __a, poly8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle( __a, __b, @@ -20082,7 +20203,7 @@ FUNK poly8x16_t vuzp1q_p8(poly8x16_t __a, poly8x16_t __b) { #endif } -FUNK 
poly16x8_t vuzp1q_p16(poly16x8_t __a, poly16x8_t __b) { +__funline poly16x8_t vuzp1q_p16(poly16x8_t __a, poly16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){9, 11, 13, 15, 1, 3, 5, 7}); #else @@ -20090,7 +20211,7 @@ FUNK poly16x8_t vuzp1q_p16(poly16x8_t __a, poly16x8_t __b) { #endif } -FUNK int8x16_t vuzp1q_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vuzp1q_s8(int8x16_t __a, int8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle( __a, __b, @@ -20102,7 +20223,7 @@ FUNK int8x16_t vuzp1q_s8(int8x16_t __a, int8x16_t __b) { #endif } -FUNK int16x8_t vuzp1q_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vuzp1q_s16(int16x8_t __a, int16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){9, 11, 13, 15, 1, 3, 5, 7}); #else @@ -20110,7 +20231,7 @@ FUNK int16x8_t vuzp1q_s16(int16x8_t __a, int16x8_t __b) { #endif } -FUNK int32x4_t vuzp1q_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vuzp1q_s32(int32x4_t __a, int32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){5, 7, 1, 3}); #else @@ -20118,7 +20239,7 @@ FUNK int32x4_t vuzp1q_s32(int32x4_t __a, int32x4_t __b) { #endif } -FUNK int64x2_t vuzp1q_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t vuzp1q_s64(int64x2_t __a, int64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){3, 1}); #else @@ -20126,7 +20247,7 @@ FUNK int64x2_t vuzp1q_s64(int64x2_t __a, int64x2_t __b) { #endif } -FUNK uint8x16_t vuzp1q_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vuzp1q_u8(uint8x16_t __a, uint8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle( __a, __b, @@ -20138,7 +20259,7 @@ FUNK uint8x16_t vuzp1q_u8(uint8x16_t __a, uint8x16_t __b) { #endif } -FUNK uint16x8_t vuzp1q_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vuzp1q_u16(uint16x8_t __a, uint16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){9, 11, 13, 15, 1, 3, 5, 7}); #else @@ -20146,7 +20267,7 @@ FUNK uint16x8_t vuzp1q_u16(uint16x8_t __a, uint16x8_t __b) { #endif } -FUNK uint32x4_t vuzp1q_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vuzp1q_u32(uint32x4_t __a, uint32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){5, 7, 1, 3}); #else @@ -20154,7 +20275,7 @@ FUNK uint32x4_t vuzp1q_u32(uint32x4_t __a, uint32x4_t __b) { #endif } -FUNK uint64x2_t vuzp1q_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vuzp1q_u64(uint64x2_t __a, uint64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){3, 1}); #else @@ -20162,7 +20283,7 @@ FUNK uint64x2_t vuzp1q_u64(uint64x2_t __a, uint64x2_t __b) { #endif } -FUNK float16x4_t vuzp2_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vuzp2_f16(float16x4_t __a, float16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){4, 6, 0, 2}); #else @@ -20170,7 +20291,7 @@ FUNK float16x4_t vuzp2_f16(float16x4_t __a, float16x4_t __b) { #endif } -FUNK float32x2_t vuzp2_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vuzp2_f32(float32x2_t __a, float32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){2, 0}); #else @@ -20178,7 +20299,7 @@ FUNK float32x2_t vuzp2_f32(float32x2_t __a, float32x2_t __b) { #endif } -FUNK poly8x8_t vuzp2_p8(poly8x8_t __a, poly8x8_t __b) { +__funline poly8x8_t vuzp2_p8(poly8x8_t __a, poly8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){8, 10, 
12, 14, 0, 2, 4, 6}); #else @@ -20186,7 +20307,7 @@ FUNK poly8x8_t vuzp2_p8(poly8x8_t __a, poly8x8_t __b) { #endif } -FUNK poly16x4_t vuzp2_p16(poly16x4_t __a, poly16x4_t __b) { +__funline poly16x4_t vuzp2_p16(poly16x4_t __a, poly16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){4, 6, 0, 2}); #else @@ -20194,7 +20315,7 @@ FUNK poly16x4_t vuzp2_p16(poly16x4_t __a, poly16x4_t __b) { #endif } -FUNK int8x8_t vuzp2_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vuzp2_s8(int8x8_t __a, int8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){8, 10, 12, 14, 0, 2, 4, 6}); #else @@ -20202,7 +20323,7 @@ FUNK int8x8_t vuzp2_s8(int8x8_t __a, int8x8_t __b) { #endif } -FUNK int16x4_t vuzp2_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vuzp2_s16(int16x4_t __a, int16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){4, 6, 0, 2}); #else @@ -20210,7 +20331,7 @@ FUNK int16x4_t vuzp2_s16(int16x4_t __a, int16x4_t __b) { #endif } -FUNK int32x2_t vuzp2_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vuzp2_s32(int32x2_t __a, int32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){2, 0}); #else @@ -20218,7 +20339,7 @@ FUNK int32x2_t vuzp2_s32(int32x2_t __a, int32x2_t __b) { #endif } -FUNK uint8x8_t vuzp2_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vuzp2_u8(uint8x8_t __a, uint8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){8, 10, 12, 14, 0, 2, 4, 6}); #else @@ -20226,7 +20347,7 @@ FUNK uint8x8_t vuzp2_u8(uint8x8_t __a, uint8x8_t __b) { #endif } -FUNK uint16x4_t vuzp2_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vuzp2_u16(uint16x4_t __a, uint16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){4, 6, 0, 2}); #else @@ -20234,7 +20355,7 @@ FUNK uint16x4_t vuzp2_u16(uint16x4_t __a, uint16x4_t __b) { #endif } -FUNK uint32x2_t vuzp2_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vuzp2_u32(uint32x2_t __a, uint32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){2, 0}); #else @@ -20242,7 +20363,7 @@ FUNK uint32x2_t vuzp2_u32(uint32x2_t __a, uint32x2_t __b) { #endif } -FUNK float16x8_t vuzp2q_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vuzp2q_f16(float16x8_t __a, float16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){8, 10, 12, 14, 0, 2, 4, 6}); #else @@ -20250,7 +20371,7 @@ FUNK float16x8_t vuzp2q_f16(float16x8_t __a, float16x8_t __b) { #endif } -FUNK float32x4_t vuzp2q_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vuzp2q_f32(float32x4_t __a, float32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){4, 6, 0, 2}); #else @@ -20258,7 +20379,7 @@ FUNK float32x4_t vuzp2q_f32(float32x4_t __a, float32x4_t __b) { #endif } -FUNK float64x2_t vuzp2q_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vuzp2q_f64(float64x2_t __a, float64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){2, 0}); #else @@ -20266,7 +20387,7 @@ FUNK float64x2_t vuzp2q_f64(float64x2_t __a, float64x2_t __b) { #endif } -FUNK poly8x16_t vuzp2q_p8(poly8x16_t __a, poly8x16_t __b) { +__funline poly8x16_t vuzp2q_p8(poly8x16_t __a, poly8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle( __a, __b, @@ -20278,7 +20399,7 @@ FUNK poly8x16_t vuzp2q_p8(poly8x16_t __a, poly8x16_t __b) { #endif } -FUNK poly16x8_t vuzp2q_p16(poly16x8_t __a, poly16x8_t __b) 
{ +__funline poly16x8_t vuzp2q_p16(poly16x8_t __a, poly16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){8, 10, 12, 14, 0, 2, 4, 6}); #else @@ -20286,7 +20407,7 @@ FUNK poly16x8_t vuzp2q_p16(poly16x8_t __a, poly16x8_t __b) { #endif } -FUNK int8x16_t vuzp2q_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vuzp2q_s8(int8x16_t __a, int8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle( __a, __b, @@ -20298,7 +20419,7 @@ FUNK int8x16_t vuzp2q_s8(int8x16_t __a, int8x16_t __b) { #endif } -FUNK int16x8_t vuzp2q_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vuzp2q_s16(int16x8_t __a, int16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){8, 10, 12, 14, 0, 2, 4, 6}); #else @@ -20306,7 +20427,7 @@ FUNK int16x8_t vuzp2q_s16(int16x8_t __a, int16x8_t __b) { #endif } -FUNK int32x4_t vuzp2q_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vuzp2q_s32(int32x4_t __a, int32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){4, 6, 0, 2}); #else @@ -20314,7 +20435,7 @@ FUNK int32x4_t vuzp2q_s32(int32x4_t __a, int32x4_t __b) { #endif } -FUNK int64x2_t vuzp2q_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t vuzp2q_s64(int64x2_t __a, int64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){2, 0}); #else @@ -20322,7 +20443,7 @@ FUNK int64x2_t vuzp2q_s64(int64x2_t __a, int64x2_t __b) { #endif } -FUNK uint8x16_t vuzp2q_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vuzp2q_u8(uint8x16_t __a, uint8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle( __a, __b, @@ -20334,7 +20455,7 @@ FUNK uint8x16_t vuzp2q_u8(uint8x16_t __a, uint8x16_t __b) { #endif } -FUNK uint16x8_t vuzp2q_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vuzp2q_u16(uint16x8_t __a, uint16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){8, 10, 12, 14, 0, 2, 4, 6}); #else @@ -20342,7 +20463,7 @@ FUNK uint16x8_t vuzp2q_u16(uint16x8_t __a, uint16x8_t __b) { #endif } -FUNK uint32x4_t vuzp2q_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vuzp2q_u32(uint32x4_t __a, uint32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){4, 6, 0, 2}); #else @@ -20350,7 +20471,7 @@ FUNK uint32x4_t vuzp2q_u32(uint32x4_t __a, uint32x4_t __b) { #endif } -FUNK uint64x2_t vuzp2q_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vuzp2q_u64(uint64x2_t __a, uint64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){2, 0}); #else @@ -20360,7 +20481,7 @@ FUNK uint64x2_t vuzp2q_u64(uint64x2_t __a, uint64x2_t __b) { __INTERLEAVE_LIST(uzp) -FUNK float16x4_t vzip1_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vzip1_f16(float16x4_t __a, float16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){6, 2, 7, 3}); #else @@ -20368,7 +20489,7 @@ FUNK float16x4_t vzip1_f16(float16x4_t __a, float16x4_t __b) { #endif } -FUNK float32x2_t vzip1_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vzip1_f32(float32x2_t __a, float32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){3, 1}); #else @@ -20376,7 +20497,7 @@ FUNK float32x2_t vzip1_f32(float32x2_t __a, float32x2_t __b) { #endif } -FUNK poly8x8_t vzip1_p8(poly8x8_t __a, poly8x8_t __b) { +__funline poly8x8_t vzip1_p8(poly8x8_t __a, poly8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){12, 4, 13, 5, 14, 6, 15, 7}); #else @@ -20384,7 
+20505,7 @@ FUNK poly8x8_t vzip1_p8(poly8x8_t __a, poly8x8_t __b) { #endif } -FUNK poly16x4_t vzip1_p16(poly16x4_t __a, poly16x4_t __b) { +__funline poly16x4_t vzip1_p16(poly16x4_t __a, poly16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){6, 2, 7, 3}); #else @@ -20392,7 +20513,7 @@ FUNK poly16x4_t vzip1_p16(poly16x4_t __a, poly16x4_t __b) { #endif } -FUNK int8x8_t vzip1_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vzip1_s8(int8x8_t __a, int8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){12, 4, 13, 5, 14, 6, 15, 7}); #else @@ -20400,7 +20521,7 @@ FUNK int8x8_t vzip1_s8(int8x8_t __a, int8x8_t __b) { #endif } -FUNK int16x4_t vzip1_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vzip1_s16(int16x4_t __a, int16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){6, 2, 7, 3}); #else @@ -20408,7 +20529,7 @@ FUNK int16x4_t vzip1_s16(int16x4_t __a, int16x4_t __b) { #endif } -FUNK int32x2_t vzip1_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vzip1_s32(int32x2_t __a, int32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){3, 1}); #else @@ -20416,7 +20537,7 @@ FUNK int32x2_t vzip1_s32(int32x2_t __a, int32x2_t __b) { #endif } -FUNK uint8x8_t vzip1_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vzip1_u8(uint8x8_t __a, uint8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){12, 4, 13, 5, 14, 6, 15, 7}); #else @@ -20424,7 +20545,7 @@ FUNK uint8x8_t vzip1_u8(uint8x8_t __a, uint8x8_t __b) { #endif } -FUNK uint16x4_t vzip1_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vzip1_u16(uint16x4_t __a, uint16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){6, 2, 7, 3}); #else @@ -20432,7 +20553,7 @@ FUNK uint16x4_t vzip1_u16(uint16x4_t __a, uint16x4_t __b) { #endif } -FUNK uint32x2_t vzip1_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vzip1_u32(uint32x2_t __a, uint32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){3, 1}); #else @@ -20440,7 +20561,7 @@ FUNK uint32x2_t vzip1_u32(uint32x2_t __a, uint32x2_t __b) { #endif } -FUNK float16x8_t vzip1q_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vzip1q_f16(float16x8_t __a, float16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){12, 4, 13, 5, 14, 6, 15, 7}); #else @@ -20448,7 +20569,7 @@ FUNK float16x8_t vzip1q_f16(float16x8_t __a, float16x8_t __b) { #endif } -FUNK float32x4_t vzip1q_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vzip1q_f32(float32x4_t __a, float32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){6, 2, 7, 3}); #else @@ -20456,7 +20577,7 @@ FUNK float32x4_t vzip1q_f32(float32x4_t __a, float32x4_t __b) { #endif } -FUNK float64x2_t vzip1q_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vzip1q_f64(float64x2_t __a, float64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){3, 1}); #else @@ -20464,7 +20585,7 @@ FUNK float64x2_t vzip1q_f64(float64x2_t __a, float64x2_t __b) { #endif } -FUNK poly8x16_t vzip1q_p8(poly8x16_t __a, poly8x16_t __b) { +__funline poly8x16_t vzip1q_p8(poly8x16_t __a, poly8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x16_t){24, 8, 25, 9, 26, 10, 27, 11, 28, 12, @@ -20476,7 +20597,7 @@ FUNK poly8x16_t vzip1q_p8(poly8x16_t __a, poly8x16_t __b) { #endif } -FUNK poly16x8_t vzip1q_p16(poly16x8_t __a, 
poly16x8_t __b) { +__funline poly16x8_t vzip1q_p16(poly16x8_t __a, poly16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){12, 4, 13, 5, 14, 6, 15, 7}); #else @@ -20484,7 +20605,7 @@ FUNK poly16x8_t vzip1q_p16(poly16x8_t __a, poly16x8_t __b) { #endif } -FUNK int8x16_t vzip1q_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vzip1q_s8(int8x16_t __a, int8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x16_t){24, 8, 25, 9, 26, 10, 27, 11, 28, 12, @@ -20496,7 +20617,7 @@ FUNK int8x16_t vzip1q_s8(int8x16_t __a, int8x16_t __b) { #endif } -FUNK int16x8_t vzip1q_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vzip1q_s16(int16x8_t __a, int16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){12, 4, 13, 5, 14, 6, 15, 7}); #else @@ -20504,7 +20625,7 @@ FUNK int16x8_t vzip1q_s16(int16x8_t __a, int16x8_t __b) { #endif } -FUNK int32x4_t vzip1q_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vzip1q_s32(int32x4_t __a, int32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){6, 2, 7, 3}); #else @@ -20512,7 +20633,7 @@ FUNK int32x4_t vzip1q_s32(int32x4_t __a, int32x4_t __b) { #endif } -FUNK int64x2_t vzip1q_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t vzip1q_s64(int64x2_t __a, int64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){3, 1}); #else @@ -20520,7 +20641,7 @@ FUNK int64x2_t vzip1q_s64(int64x2_t __a, int64x2_t __b) { #endif } -FUNK uint8x16_t vzip1q_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vzip1q_u8(uint8x16_t __a, uint8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x16_t){24, 8, 25, 9, 26, 10, 27, 11, 28, 12, @@ -20532,7 +20653,7 @@ FUNK uint8x16_t vzip1q_u8(uint8x16_t __a, uint8x16_t __b) { #endif } -FUNK uint16x8_t vzip1q_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vzip1q_u16(uint16x8_t __a, uint16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){12, 4, 13, 5, 14, 6, 15, 7}); #else @@ -20540,7 +20661,7 @@ FUNK uint16x8_t vzip1q_u16(uint16x8_t __a, uint16x8_t __b) { #endif } -FUNK uint32x4_t vzip1q_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vzip1q_u32(uint32x4_t __a, uint32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){6, 2, 7, 3}); #else @@ -20548,7 +20669,7 @@ FUNK uint32x4_t vzip1q_u32(uint32x4_t __a, uint32x4_t __b) { #endif } -FUNK uint64x2_t vzip1q_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vzip1q_u64(uint64x2_t __a, uint64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){3, 1}); #else @@ -20556,7 +20677,7 @@ FUNK uint64x2_t vzip1q_u64(uint64x2_t __a, uint64x2_t __b) { #endif } -FUNK float16x4_t vzip2_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vzip2_f16(float16x4_t __a, float16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){4, 0, 5, 1}); #else @@ -20564,7 +20685,7 @@ FUNK float16x4_t vzip2_f16(float16x4_t __a, float16x4_t __b) { #endif } -FUNK float32x2_t vzip2_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vzip2_f32(float32x2_t __a, float32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){2, 0}); #else @@ -20572,7 +20693,7 @@ FUNK float32x2_t vzip2_f32(float32x2_t __a, float32x2_t __b) { #endif } -FUNK poly8x8_t vzip2_p8(poly8x8_t __a, poly8x8_t __b) { +__funline poly8x8_t vzip2_p8(poly8x8_t __a, poly8x8_t __b) { #ifdef 
__AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){8, 0, 9, 1, 10, 2, 11, 3}); #else @@ -20580,7 +20701,7 @@ FUNK poly8x8_t vzip2_p8(poly8x8_t __a, poly8x8_t __b) { #endif } -FUNK poly16x4_t vzip2_p16(poly16x4_t __a, poly16x4_t __b) { +__funline poly16x4_t vzip2_p16(poly16x4_t __a, poly16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){4, 0, 5, 1}); #else @@ -20588,7 +20709,7 @@ FUNK poly16x4_t vzip2_p16(poly16x4_t __a, poly16x4_t __b) { #endif } -FUNK int8x8_t vzip2_s8(int8x8_t __a, int8x8_t __b) { +__funline int8x8_t vzip2_s8(int8x8_t __a, int8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){8, 0, 9, 1, 10, 2, 11, 3}); #else @@ -20596,7 +20717,7 @@ FUNK int8x8_t vzip2_s8(int8x8_t __a, int8x8_t __b) { #endif } -FUNK int16x4_t vzip2_s16(int16x4_t __a, int16x4_t __b) { +__funline int16x4_t vzip2_s16(int16x4_t __a, int16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){4, 0, 5, 1}); #else @@ -20604,7 +20725,7 @@ FUNK int16x4_t vzip2_s16(int16x4_t __a, int16x4_t __b) { #endif } -FUNK int32x2_t vzip2_s32(int32x2_t __a, int32x2_t __b) { +__funline int32x2_t vzip2_s32(int32x2_t __a, int32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){2, 0}); #else @@ -20612,7 +20733,7 @@ FUNK int32x2_t vzip2_s32(int32x2_t __a, int32x2_t __b) { #endif } -FUNK uint8x8_t vzip2_u8(uint8x8_t __a, uint8x8_t __b) { +__funline uint8x8_t vzip2_u8(uint8x8_t __a, uint8x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint8x8_t){8, 0, 9, 1, 10, 2, 11, 3}); #else @@ -20620,7 +20741,7 @@ FUNK uint8x8_t vzip2_u8(uint8x8_t __a, uint8x8_t __b) { #endif } -FUNK uint16x4_t vzip2_u16(uint16x4_t __a, uint16x4_t __b) { +__funline uint16x4_t vzip2_u16(uint16x4_t __a, uint16x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x4_t){4, 0, 5, 1}); #else @@ -20628,7 +20749,7 @@ FUNK uint16x4_t vzip2_u16(uint16x4_t __a, uint16x4_t __b) { #endif } -FUNK uint32x2_t vzip2_u32(uint32x2_t __a, uint32x2_t __b) { +__funline uint32x2_t vzip2_u32(uint32x2_t __a, uint32x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x2_t){2, 0}); #else @@ -20636,7 +20757,7 @@ FUNK uint32x2_t vzip2_u32(uint32x2_t __a, uint32x2_t __b) { #endif } -FUNK float16x8_t vzip2q_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vzip2q_f16(float16x8_t __a, float16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){8, 0, 9, 1, 10, 2, 11, 3}); #else @@ -20644,7 +20765,7 @@ FUNK float16x8_t vzip2q_f16(float16x8_t __a, float16x8_t __b) { #endif } -FUNK float32x4_t vzip2q_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vzip2q_f32(float32x4_t __a, float32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){4, 0, 5, 1}); #else @@ -20652,7 +20773,7 @@ FUNK float32x4_t vzip2q_f32(float32x4_t __a, float32x4_t __b) { #endif } -FUNK float64x2_t vzip2q_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vzip2q_f64(float64x2_t __a, float64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){2, 0}); #else @@ -20660,7 +20781,7 @@ FUNK float64x2_t vzip2q_f64(float64x2_t __a, float64x2_t __b) { #endif } -FUNK poly8x16_t vzip2q_p8(poly8x16_t __a, poly8x16_t __b) { +__funline poly8x16_t vzip2q_p8(poly8x16_t __a, poly8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle( __a, __b, @@ -20672,7 +20793,7 @@ FUNK poly8x16_t vzip2q_p8(poly8x16_t __a, poly8x16_t __b) { 
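/*
 * Aside: what the uzp/zip shuffles in this region compute. vuzp1 keeps
 * the even-indexed elements of the two inputs and vuzp2 the odd-indexed
 * ones; vzip1/vzip2 interleave the low and high halves. The
 * __AARCH64EB__ branches pick mirrored shuffle indices because lane
 * order in the register is reversed on big-endian targets. A minimal
 * sketch of the little-endian behaviour, assuming int16x4_t inputs
 * {a0,a1,a2,a3} and {b0,b1,b2,b3}:
 *
 *   vuzp1_s16(a, b) -> {a0, a2, b0, b2}
 *   vuzp2_s16(a, b) -> {a1, a3, b1, b3}
 *   vzip1_s16(a, b) -> {a0, b0, a1, b1}
 *   vzip2_s16(a, b) -> {a2, b2, a3, b3}
 *
 * The __DEFINTERLEAVE macro earlier in this patch then builds the
 * two-register forms out of these halves; for s16 it expands roughly to:
 *
 *   __funline int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) {
 *     return (int16x4x2_t){vuzp1_s16(a, b), vuzp2_s16(a, b)};
 *   }
 */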
#endif } -FUNK poly16x8_t vzip2q_p16(poly16x8_t __a, poly16x8_t __b) { +__funline poly16x8_t vzip2q_p16(poly16x8_t __a, poly16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){8, 0, 9, 1, 10, 2, 11, 3}); #else @@ -20680,7 +20801,7 @@ FUNK poly16x8_t vzip2q_p16(poly16x8_t __a, poly16x8_t __b) { #endif } -FUNK int8x16_t vzip2q_s8(int8x16_t __a, int8x16_t __b) { +__funline int8x16_t vzip2q_s8(int8x16_t __a, int8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle( __a, __b, @@ -20692,7 +20813,7 @@ FUNK int8x16_t vzip2q_s8(int8x16_t __a, int8x16_t __b) { #endif } -FUNK int16x8_t vzip2q_s16(int16x8_t __a, int16x8_t __b) { +__funline int16x8_t vzip2q_s16(int16x8_t __a, int16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){8, 0, 9, 1, 10, 2, 11, 3}); #else @@ -20700,7 +20821,7 @@ FUNK int16x8_t vzip2q_s16(int16x8_t __a, int16x8_t __b) { #endif } -FUNK int32x4_t vzip2q_s32(int32x4_t __a, int32x4_t __b) { +__funline int32x4_t vzip2q_s32(int32x4_t __a, int32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){4, 0, 5, 1}); #else @@ -20708,7 +20829,7 @@ FUNK int32x4_t vzip2q_s32(int32x4_t __a, int32x4_t __b) { #endif } -FUNK int64x2_t vzip2q_s64(int64x2_t __a, int64x2_t __b) { +__funline int64x2_t vzip2q_s64(int64x2_t __a, int64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){2, 0}); #else @@ -20716,7 +20837,7 @@ FUNK int64x2_t vzip2q_s64(int64x2_t __a, int64x2_t __b) { #endif } -FUNK uint8x16_t vzip2q_u8(uint8x16_t __a, uint8x16_t __b) { +__funline uint8x16_t vzip2q_u8(uint8x16_t __a, uint8x16_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle( __a, __b, @@ -20728,7 +20849,7 @@ FUNK uint8x16_t vzip2q_u8(uint8x16_t __a, uint8x16_t __b) { #endif } -FUNK uint16x8_t vzip2q_u16(uint16x8_t __a, uint16x8_t __b) { +__funline uint16x8_t vzip2q_u16(uint16x8_t __a, uint16x8_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint16x8_t){8, 0, 9, 1, 10, 2, 11, 3}); #else @@ -20736,7 +20857,7 @@ FUNK uint16x8_t vzip2q_u16(uint16x8_t __a, uint16x8_t __b) { #endif } -FUNK uint32x4_t vzip2q_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vzip2q_u32(uint32x4_t __a, uint32x4_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint32x4_t){4, 0, 5, 1}); #else @@ -20744,7 +20865,7 @@ FUNK uint32x4_t vzip2q_u32(uint32x4_t __a, uint32x4_t __b) { #endif } -FUNK uint64x2_t vzip2q_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vzip2q_u64(uint64x2_t __a, uint64x2_t __b) { #ifdef __AARCH64EB__ return __builtin_shuffle(__a, __b, (uint64x2_t){2, 0}); #else @@ -20764,675 +20885,683 @@ __INTERLEAVE_LIST(zip) #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+fp16") -FUNK float16x4_t vabs_f16(float16x4_t __a) { +__funline float16x4_t vabs_f16(float16x4_t __a) { return __builtin_aarch64_absv4hf(__a); } -FUNK float16x8_t vabsq_f16(float16x8_t __a) { +__funline float16x8_t vabsq_f16(float16x8_t __a) { return __builtin_aarch64_absv8hf(__a); } -FUNK uint16x4_t vceqz_f16(float16x4_t __a) { +__funline uint16x4_t vceqz_f16(float16x4_t __a) { return __builtin_aarch64_cmeqv4hf_uss(__a, vdup_n_f16(0.0f)); } -FUNK uint16x8_t vceqzq_f16(float16x8_t __a) { +__funline uint16x8_t vceqzq_f16(float16x8_t __a) { return __builtin_aarch64_cmeqv8hf_uss(__a, vdupq_n_f16(0.0f)); } -FUNK uint16x4_t vcgez_f16(float16x4_t __a) { +__funline uint16x4_t vcgez_f16(float16x4_t __a) { return __builtin_aarch64_cmgev4hf_uss(__a, vdup_n_f16(0.0f)); } -FUNK uint16x8_t 
vcgezq_f16(float16x8_t __a) { +__funline uint16x8_t vcgezq_f16(float16x8_t __a) { return __builtin_aarch64_cmgev8hf_uss(__a, vdupq_n_f16(0.0f)); } -FUNK uint16x4_t vcgtz_f16(float16x4_t __a) { +__funline uint16x4_t vcgtz_f16(float16x4_t __a) { return __builtin_aarch64_cmgtv4hf_uss(__a, vdup_n_f16(0.0f)); } -FUNK uint16x8_t vcgtzq_f16(float16x8_t __a) { +__funline uint16x8_t vcgtzq_f16(float16x8_t __a) { return __builtin_aarch64_cmgtv8hf_uss(__a, vdupq_n_f16(0.0f)); } -FUNK uint16x4_t vclez_f16(float16x4_t __a) { +__funline uint16x4_t vclez_f16(float16x4_t __a) { return __builtin_aarch64_cmlev4hf_uss(__a, vdup_n_f16(0.0f)); } -FUNK uint16x8_t vclezq_f16(float16x8_t __a) { +__funline uint16x8_t vclezq_f16(float16x8_t __a) { return __builtin_aarch64_cmlev8hf_uss(__a, vdupq_n_f16(0.0f)); } -FUNK uint16x4_t vcltz_f16(float16x4_t __a) { +__funline uint16x4_t vcltz_f16(float16x4_t __a) { return __builtin_aarch64_cmltv4hf_uss(__a, vdup_n_f16(0.0f)); } -FUNK uint16x8_t vcltzq_f16(float16x8_t __a) { +__funline uint16x8_t vcltzq_f16(float16x8_t __a) { return __builtin_aarch64_cmltv8hf_uss(__a, vdupq_n_f16(0.0f)); } -FUNK float16x4_t vcvt_f16_s16(int16x4_t __a) { +__funline float16x4_t vcvt_f16_s16(int16x4_t __a) { return __builtin_aarch64_floatv4hiv4hf(__a); } -FUNK float16x8_t vcvtq_f16_s16(int16x8_t __a) { +__funline float16x8_t vcvtq_f16_s16(int16x8_t __a) { return __builtin_aarch64_floatv8hiv8hf(__a); } -FUNK float16x4_t vcvt_f16_u16(uint16x4_t __a) { +__funline float16x4_t vcvt_f16_u16(uint16x4_t __a) { return __builtin_aarch64_floatunsv4hiv4hf((int16x4_t)__a); } -FUNK float16x8_t vcvtq_f16_u16(uint16x8_t __a) { +__funline float16x8_t vcvtq_f16_u16(uint16x8_t __a) { return __builtin_aarch64_floatunsv8hiv8hf((int16x8_t)__a); } -FUNK int16x4_t vcvt_s16_f16(float16x4_t __a) { +__funline int16x4_t vcvt_s16_f16(float16x4_t __a) { return __builtin_aarch64_lbtruncv4hfv4hi(__a); } -FUNK int16x8_t vcvtq_s16_f16(float16x8_t __a) { +__funline int16x8_t vcvtq_s16_f16(float16x8_t __a) { return __builtin_aarch64_lbtruncv8hfv8hi(__a); } -FUNK uint16x4_t vcvt_u16_f16(float16x4_t __a) { +__funline uint16x4_t vcvt_u16_f16(float16x4_t __a) { return __builtin_aarch64_lbtruncuv4hfv4hi_us(__a); } -FUNK uint16x8_t vcvtq_u16_f16(float16x8_t __a) { +__funline uint16x8_t vcvtq_u16_f16(float16x8_t __a) { return __builtin_aarch64_lbtruncuv8hfv8hi_us(__a); } -FUNK int16x4_t vcvta_s16_f16(float16x4_t __a) { +__funline int16x4_t vcvta_s16_f16(float16x4_t __a) { return __builtin_aarch64_lroundv4hfv4hi(__a); } -FUNK int16x8_t vcvtaq_s16_f16(float16x8_t __a) { +__funline int16x8_t vcvtaq_s16_f16(float16x8_t __a) { return __builtin_aarch64_lroundv8hfv8hi(__a); } -FUNK uint16x4_t vcvta_u16_f16(float16x4_t __a) { +__funline uint16x4_t vcvta_u16_f16(float16x4_t __a) { return __builtin_aarch64_lrounduv4hfv4hi_us(__a); } -FUNK uint16x8_t vcvtaq_u16_f16(float16x8_t __a) { +__funline uint16x8_t vcvtaq_u16_f16(float16x8_t __a) { return __builtin_aarch64_lrounduv8hfv8hi_us(__a); } -FUNK int16x4_t vcvtm_s16_f16(float16x4_t __a) { +__funline int16x4_t vcvtm_s16_f16(float16x4_t __a) { return __builtin_aarch64_lfloorv4hfv4hi(__a); } -FUNK int16x8_t vcvtmq_s16_f16(float16x8_t __a) { +__funline int16x8_t vcvtmq_s16_f16(float16x8_t __a) { return __builtin_aarch64_lfloorv8hfv8hi(__a); } -FUNK uint16x4_t vcvtm_u16_f16(float16x4_t __a) { +__funline uint16x4_t vcvtm_u16_f16(float16x4_t __a) { return __builtin_aarch64_lflooruv4hfv4hi_us(__a); } -FUNK uint16x8_t vcvtmq_u16_f16(float16x8_t __a) { +__funline uint16x8_t vcvtmq_u16_f16(float16x8_t __a) { 
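/*
 * Aside on the vcvt* families in this hunk: the suffix letter selects
 * the rounding mode applied before the float16 -> integer conversion.
 * Plain vcvt truncates toward zero, vcvta rounds to nearest with ties
 * away from zero, vcvtm rounds toward minus infinity (floor), vcvtn
 * rounds to nearest with ties to even, and vcvtp rounds toward plus
 * infinity (ceil). As a small worked example, a lane holding -1.5f
 * converts as:
 *
 *   vcvt_s16_f16  -> -1   (truncate)
 *   vcvta_s16_f16 -> -2   (ties away from zero)
 *   vcvtm_s16_f16 -> -2   (floor)
 *   vcvtn_s16_f16 -> -2   (ties to even)
 *   vcvtp_s16_f16 -> -1   (ceil)
 */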
return __builtin_aarch64_lflooruv8hfv8hi_us(__a); } -FUNK int16x4_t vcvtn_s16_f16(float16x4_t __a) { +__funline int16x4_t vcvtn_s16_f16(float16x4_t __a) { return __builtin_aarch64_lfrintnv4hfv4hi(__a); } -FUNK int16x8_t vcvtnq_s16_f16(float16x8_t __a) { +__funline int16x8_t vcvtnq_s16_f16(float16x8_t __a) { return __builtin_aarch64_lfrintnv8hfv8hi(__a); } -FUNK uint16x4_t vcvtn_u16_f16(float16x4_t __a) { +__funline uint16x4_t vcvtn_u16_f16(float16x4_t __a) { return __builtin_aarch64_lfrintnuv4hfv4hi_us(__a); } -FUNK uint16x8_t vcvtnq_u16_f16(float16x8_t __a) { +__funline uint16x8_t vcvtnq_u16_f16(float16x8_t __a) { return __builtin_aarch64_lfrintnuv8hfv8hi_us(__a); } -FUNK int16x4_t vcvtp_s16_f16(float16x4_t __a) { +__funline int16x4_t vcvtp_s16_f16(float16x4_t __a) { return __builtin_aarch64_lceilv4hfv4hi(__a); } -FUNK int16x8_t vcvtpq_s16_f16(float16x8_t __a) { +__funline int16x8_t vcvtpq_s16_f16(float16x8_t __a) { return __builtin_aarch64_lceilv8hfv8hi(__a); } -FUNK uint16x4_t vcvtp_u16_f16(float16x4_t __a) { +__funline uint16x4_t vcvtp_u16_f16(float16x4_t __a) { return __builtin_aarch64_lceiluv4hfv4hi_us(__a); } -FUNK uint16x8_t vcvtpq_u16_f16(float16x8_t __a) { +__funline uint16x8_t vcvtpq_u16_f16(float16x8_t __a) { return __builtin_aarch64_lceiluv8hfv8hi_us(__a); } -FUNK float16x4_t vneg_f16(float16x4_t __a) { +__funline float16x4_t vneg_f16(float16x4_t __a) { return -__a; } -FUNK float16x8_t vnegq_f16(float16x8_t __a) { +__funline float16x8_t vnegq_f16(float16x8_t __a) { return -__a; } -FUNK float16x4_t vrecpe_f16(float16x4_t __a) { +__funline float16x4_t vrecpe_f16(float16x4_t __a) { return __builtin_aarch64_frecpev4hf(__a); } -FUNK float16x8_t vrecpeq_f16(float16x8_t __a) { +__funline float16x8_t vrecpeq_f16(float16x8_t __a) { return __builtin_aarch64_frecpev8hf(__a); } -FUNK float16x4_t vrnd_f16(float16x4_t __a) { +__funline float16x4_t vrnd_f16(float16x4_t __a) { return __builtin_aarch64_btruncv4hf(__a); } -FUNK float16x8_t vrndq_f16(float16x8_t __a) { +__funline float16x8_t vrndq_f16(float16x8_t __a) { return __builtin_aarch64_btruncv8hf(__a); } -FUNK float16x4_t vrnda_f16(float16x4_t __a) { +__funline float16x4_t vrnda_f16(float16x4_t __a) { return __builtin_aarch64_roundv4hf(__a); } -FUNK float16x8_t vrndaq_f16(float16x8_t __a) { +__funline float16x8_t vrndaq_f16(float16x8_t __a) { return __builtin_aarch64_roundv8hf(__a); } -FUNK float16x4_t vrndi_f16(float16x4_t __a) { +__funline float16x4_t vrndi_f16(float16x4_t __a) { return __builtin_aarch64_nearbyintv4hf(__a); } -FUNK float16x8_t vrndiq_f16(float16x8_t __a) { +__funline float16x8_t vrndiq_f16(float16x8_t __a) { return __builtin_aarch64_nearbyintv8hf(__a); } -FUNK float16x4_t vrndm_f16(float16x4_t __a) { +__funline float16x4_t vrndm_f16(float16x4_t __a) { return __builtin_aarch64_floorv4hf(__a); } -FUNK float16x8_t vrndmq_f16(float16x8_t __a) { +__funline float16x8_t vrndmq_f16(float16x8_t __a) { return __builtin_aarch64_floorv8hf(__a); } -FUNK float16x4_t vrndn_f16(float16x4_t __a) { +__funline float16x4_t vrndn_f16(float16x4_t __a) { return __builtin_aarch64_frintnv4hf(__a); } -FUNK float16x8_t vrndnq_f16(float16x8_t __a) { +__funline float16x8_t vrndnq_f16(float16x8_t __a) { return __builtin_aarch64_frintnv8hf(__a); } -FUNK float16x4_t vrndp_f16(float16x4_t __a) { +__funline float16x4_t vrndp_f16(float16x4_t __a) { return __builtin_aarch64_ceilv4hf(__a); } -FUNK float16x8_t vrndpq_f16(float16x8_t __a) { +__funline float16x8_t vrndpq_f16(float16x8_t __a) { return __builtin_aarch64_ceilv8hf(__a); } -FUNK float16x4_t 
vrndx_f16(float16x4_t __a) { +__funline float16x4_t vrndx_f16(float16x4_t __a) { return __builtin_aarch64_rintv4hf(__a); } -FUNK float16x8_t vrndxq_f16(float16x8_t __a) { +__funline float16x8_t vrndxq_f16(float16x8_t __a) { return __builtin_aarch64_rintv8hf(__a); } -FUNK float16x4_t vrsqrte_f16(float16x4_t a) { +__funline float16x4_t vrsqrte_f16(float16x4_t a) { return __builtin_aarch64_rsqrtev4hf(a); } -FUNK float16x8_t vrsqrteq_f16(float16x8_t a) { +__funline float16x8_t vrsqrteq_f16(float16x8_t a) { return __builtin_aarch64_rsqrtev8hf(a); } -FUNK float16x4_t vsqrt_f16(float16x4_t a) { +__funline float16x4_t vsqrt_f16(float16x4_t a) { return __builtin_aarch64_sqrtv4hf(a); } -FUNK float16x8_t vsqrtq_f16(float16x8_t a) { +__funline float16x8_t vsqrtq_f16(float16x8_t a) { return __builtin_aarch64_sqrtv8hf(a); } -FUNK float16x4_t vadd_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vadd_f16(float16x4_t __a, float16x4_t __b) { return __a + __b; } -FUNK float16x8_t vaddq_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vaddq_f16(float16x8_t __a, float16x8_t __b) { return __a + __b; } -FUNK float16x4_t vabd_f16(float16x4_t a, float16x4_t b) { +__funline float16x4_t vabd_f16(float16x4_t a, float16x4_t b) { return __builtin_aarch64_fabdv4hf(a, b); } -FUNK float16x8_t vabdq_f16(float16x8_t a, float16x8_t b) { +__funline float16x8_t vabdq_f16(float16x8_t a, float16x8_t b) { return __builtin_aarch64_fabdv8hf(a, b); } -FUNK uint16x4_t vcage_f16(float16x4_t __a, float16x4_t __b) { +__funline uint16x4_t vcage_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_facgev4hf_uss(__a, __b); } -FUNK uint16x8_t vcageq_f16(float16x8_t __a, float16x8_t __b) { +__funline uint16x8_t vcageq_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_facgev8hf_uss(__a, __b); } -FUNK uint16x4_t vcagt_f16(float16x4_t __a, float16x4_t __b) { +__funline uint16x4_t vcagt_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_facgtv4hf_uss(__a, __b); } -FUNK uint16x8_t vcagtq_f16(float16x8_t __a, float16x8_t __b) { +__funline uint16x8_t vcagtq_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_facgtv8hf_uss(__a, __b); } -FUNK uint16x4_t vcale_f16(float16x4_t __a, float16x4_t __b) { +__funline uint16x4_t vcale_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_faclev4hf_uss(__a, __b); } -FUNK uint16x8_t vcaleq_f16(float16x8_t __a, float16x8_t __b) { +__funline uint16x8_t vcaleq_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_faclev8hf_uss(__a, __b); } -FUNK uint16x4_t vcalt_f16(float16x4_t __a, float16x4_t __b) { +__funline uint16x4_t vcalt_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_facltv4hf_uss(__a, __b); } -FUNK uint16x8_t vcaltq_f16(float16x8_t __a, float16x8_t __b) { +__funline uint16x8_t vcaltq_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_facltv8hf_uss(__a, __b); } -FUNK uint16x4_t vceq_f16(float16x4_t __a, float16x4_t __b) { +__funline uint16x4_t vceq_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_cmeqv4hf_uss(__a, __b); } -FUNK uint16x8_t vceqq_f16(float16x8_t __a, float16x8_t __b) { +__funline uint16x8_t vceqq_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_cmeqv8hf_uss(__a, __b); } -FUNK uint16x4_t vcge_f16(float16x4_t __a, float16x4_t __b) { +__funline uint16x4_t vcge_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_cmgev4hf_uss(__a, __b); } -FUNK uint16x8_t vcgeq_f16(float16x8_t __a, float16x8_t __b) { +__funline 
uint16x8_t vcgeq_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_cmgev8hf_uss(__a, __b); } -FUNK uint16x4_t vcgt_f16(float16x4_t __a, float16x4_t __b) { +__funline uint16x4_t vcgt_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_cmgtv4hf_uss(__a, __b); } -FUNK uint16x8_t vcgtq_f16(float16x8_t __a, float16x8_t __b) { +__funline uint16x8_t vcgtq_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_cmgtv8hf_uss(__a, __b); } -FUNK uint16x4_t vcle_f16(float16x4_t __a, float16x4_t __b) { +__funline uint16x4_t vcle_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_cmlev4hf_uss(__a, __b); } -FUNK uint16x8_t vcleq_f16(float16x8_t __a, float16x8_t __b) { +__funline uint16x8_t vcleq_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_cmlev8hf_uss(__a, __b); } -FUNK uint16x4_t vclt_f16(float16x4_t __a, float16x4_t __b) { +__funline uint16x4_t vclt_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_cmltv4hf_uss(__a, __b); } -FUNK uint16x8_t vcltq_f16(float16x8_t __a, float16x8_t __b) { +__funline uint16x8_t vcltq_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_cmltv8hf_uss(__a, __b); } -FUNK float16x4_t vcvt_n_f16_s16(int16x4_t __a, const int __b) { +__funline float16x4_t vcvt_n_f16_s16(int16x4_t __a, const int __b) { return __builtin_aarch64_scvtfv4hi(__a, __b); } -FUNK float16x8_t vcvtq_n_f16_s16(int16x8_t __a, const int __b) { +__funline float16x8_t vcvtq_n_f16_s16(int16x8_t __a, const int __b) { return __builtin_aarch64_scvtfv8hi(__a, __b); } -FUNK float16x4_t vcvt_n_f16_u16(uint16x4_t __a, const int __b) { +__funline float16x4_t vcvt_n_f16_u16(uint16x4_t __a, const int __b) { return __builtin_aarch64_ucvtfv4hi_sus(__a, __b); } -FUNK float16x8_t vcvtq_n_f16_u16(uint16x8_t __a, const int __b) { +__funline float16x8_t vcvtq_n_f16_u16(uint16x8_t __a, const int __b) { return __builtin_aarch64_ucvtfv8hi_sus(__a, __b); } -FUNK int16x4_t vcvt_n_s16_f16(float16x4_t __a, const int __b) { +__funline int16x4_t vcvt_n_s16_f16(float16x4_t __a, const int __b) { return __builtin_aarch64_fcvtzsv4hf(__a, __b); } -FUNK int16x8_t vcvtq_n_s16_f16(float16x8_t __a, const int __b) { +__funline int16x8_t vcvtq_n_s16_f16(float16x8_t __a, const int __b) { return __builtin_aarch64_fcvtzsv8hf(__a, __b); } -FUNK uint16x4_t vcvt_n_u16_f16(float16x4_t __a, const int __b) { +__funline uint16x4_t vcvt_n_u16_f16(float16x4_t __a, const int __b) { return __builtin_aarch64_fcvtzuv4hf_uss(__a, __b); } -FUNK uint16x8_t vcvtq_n_u16_f16(float16x8_t __a, const int __b) { +__funline uint16x8_t vcvtq_n_u16_f16(float16x8_t __a, const int __b) { return __builtin_aarch64_fcvtzuv8hf_uss(__a, __b); } -FUNK float16x4_t vdiv_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vdiv_f16(float16x4_t __a, float16x4_t __b) { return __a / __b; } -FUNK float16x8_t vdivq_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vdivq_f16(float16x8_t __a, float16x8_t __b) { return __a / __b; } -FUNK float16x4_t vmax_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vmax_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_smax_nanv4hf(__a, __b); } -FUNK float16x8_t vmaxq_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vmaxq_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_smax_nanv8hf(__a, __b); } -FUNK float16x4_t vmaxnm_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vmaxnm_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_fmaxv4hf(__a, __b); } -FUNK 
float16x8_t vmaxnmq_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vmaxnmq_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_fmaxv8hf(__a, __b); } -FUNK float16x4_t vmin_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vmin_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_smin_nanv4hf(__a, __b); } -FUNK float16x8_t vminq_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vminq_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_smin_nanv8hf(__a, __b); } -FUNK float16x4_t vminnm_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vminnm_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_fminv4hf(__a, __b); } -FUNK float16x8_t vminnmq_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vminnmq_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_fminv8hf(__a, __b); } -FUNK float16x4_t vmul_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vmul_f16(float16x4_t __a, float16x4_t __b) { return __a * __b; } -FUNK float16x8_t vmulq_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vmulq_f16(float16x8_t __a, float16x8_t __b) { return __a * __b; } -FUNK float16x4_t vmulx_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vmulx_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_fmulxv4hf(__a, __b); } -FUNK float16x8_t vmulxq_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vmulxq_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_fmulxv8hf(__a, __b); } -FUNK float16x4_t vpadd_f16(float16x4_t a, float16x4_t b) { +__funline float16x4_t vpadd_f16(float16x4_t a, float16x4_t b) { return __builtin_aarch64_faddpv4hf(a, b); } -FUNK float16x8_t vpaddq_f16(float16x8_t a, float16x8_t b) { +__funline float16x8_t vpaddq_f16(float16x8_t a, float16x8_t b) { return __builtin_aarch64_faddpv8hf(a, b); } -FUNK float16x4_t vpmax_f16(float16x4_t a, float16x4_t b) { +__funline float16x4_t vpmax_f16(float16x4_t a, float16x4_t b) { return __builtin_aarch64_smax_nanpv4hf(a, b); } -FUNK float16x8_t vpmaxq_f16(float16x8_t a, float16x8_t b) { +__funline float16x8_t vpmaxq_f16(float16x8_t a, float16x8_t b) { return __builtin_aarch64_smax_nanpv8hf(a, b); } -FUNK float16x4_t vpmaxnm_f16(float16x4_t a, float16x4_t b) { +__funline float16x4_t vpmaxnm_f16(float16x4_t a, float16x4_t b) { return __builtin_aarch64_smaxpv4hf(a, b); } -FUNK float16x8_t vpmaxnmq_f16(float16x8_t a, float16x8_t b) { +__funline float16x8_t vpmaxnmq_f16(float16x8_t a, float16x8_t b) { return __builtin_aarch64_smaxpv8hf(a, b); } -FUNK float16x4_t vpmin_f16(float16x4_t a, float16x4_t b) { +__funline float16x4_t vpmin_f16(float16x4_t a, float16x4_t b) { return __builtin_aarch64_smin_nanpv4hf(a, b); } -FUNK float16x8_t vpminq_f16(float16x8_t a, float16x8_t b) { +__funline float16x8_t vpminq_f16(float16x8_t a, float16x8_t b) { return __builtin_aarch64_smin_nanpv8hf(a, b); } -FUNK float16x4_t vpminnm_f16(float16x4_t a, float16x4_t b) { +__funline float16x4_t vpminnm_f16(float16x4_t a, float16x4_t b) { return __builtin_aarch64_sminpv4hf(a, b); } -FUNK float16x8_t vpminnmq_f16(float16x8_t a, float16x8_t b) { +__funline float16x8_t vpminnmq_f16(float16x8_t a, float16x8_t b) { return __builtin_aarch64_sminpv8hf(a, b); } -FUNK float16x4_t vrecps_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vrecps_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_frecpsv4hf(__a, __b); } -FUNK float16x8_t vrecpsq_f16(float16x8_t __a, float16x8_t 
__b) { +__funline float16x8_t vrecpsq_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_frecpsv8hf(__a, __b); } -FUNK float16x4_t vrsqrts_f16(float16x4_t a, float16x4_t b) { +__funline float16x4_t vrsqrts_f16(float16x4_t a, float16x4_t b) { return __builtin_aarch64_rsqrtsv4hf(a, b); } -FUNK float16x8_t vrsqrtsq_f16(float16x8_t a, float16x8_t b) { +__funline float16x8_t vrsqrtsq_f16(float16x8_t a, float16x8_t b) { return __builtin_aarch64_rsqrtsv8hf(a, b); } -FUNK float16x4_t vsub_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vsub_f16(float16x4_t __a, float16x4_t __b) { return __a - __b; } -FUNK float16x8_t vsubq_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vsubq_f16(float16x8_t __a, float16x8_t __b) { return __a - __b; } -FUNK float16x4_t vfma_f16(float16x4_t __a, float16x4_t __b, float16x4_t __c) { +__funline float16x4_t vfma_f16(float16x4_t __a, float16x4_t __b, + float16x4_t __c) { return __builtin_aarch64_fmav4hf(__b, __c, __a); } -FUNK float16x8_t vfmaq_f16(float16x8_t __a, float16x8_t __b, float16x8_t __c) { +__funline float16x8_t vfmaq_f16(float16x8_t __a, float16x8_t __b, + float16x8_t __c) { return __builtin_aarch64_fmav8hf(__b, __c, __a); } -FUNK float16x4_t vfms_f16(float16x4_t __a, float16x4_t __b, float16x4_t __c) { +__funline float16x4_t vfms_f16(float16x4_t __a, float16x4_t __b, + float16x4_t __c) { return __builtin_aarch64_fnmav4hf(__b, __c, __a); } -FUNK float16x8_t vfmsq_f16(float16x8_t __a, float16x8_t __b, float16x8_t __c) { +__funline float16x8_t vfmsq_f16(float16x8_t __a, float16x8_t __b, + float16x8_t __c) { return __builtin_aarch64_fnmav8hf(__b, __c, __a); } -FUNK float16_t vfmah_lane_f16(float16_t __a, float16_t __b, float16x4_t __c, - const int __lane) { +__funline float16_t vfmah_lane_f16(float16_t __a, float16_t __b, + float16x4_t __c, const int __lane) { return vfmah_f16(__a, __b, __aarch64_vget_lane_any(__c, __lane)); } -FUNK float16_t vfmah_laneq_f16(float16_t __a, float16_t __b, float16x8_t __c, - const int __lane) { +__funline float16_t vfmah_laneq_f16(float16_t __a, float16_t __b, + float16x8_t __c, const int __lane) { return vfmah_f16(__a, __b, __aarch64_vget_lane_any(__c, __lane)); } -FUNK float16x4_t vfma_lane_f16(float16x4_t __a, float16x4_t __b, - float16x4_t __c, const int __lane) { +__funline float16x4_t vfma_lane_f16(float16x4_t __a, float16x4_t __b, + float16x4_t __c, const int __lane) { return vfma_f16(__a, __b, __aarch64_vdup_lane_f16(__c, __lane)); } -FUNK float16x8_t vfmaq_lane_f16(float16x8_t __a, float16x8_t __b, - float16x4_t __c, const int __lane) { +__funline float16x8_t vfmaq_lane_f16(float16x8_t __a, float16x8_t __b, + float16x4_t __c, const int __lane) { return vfmaq_f16(__a, __b, __aarch64_vdupq_lane_f16(__c, __lane)); } -FUNK float16x4_t vfma_laneq_f16(float16x4_t __a, float16x4_t __b, - float16x8_t __c, const int __lane) { +__funline float16x4_t vfma_laneq_f16(float16x4_t __a, float16x4_t __b, + float16x8_t __c, const int __lane) { return vfma_f16(__a, __b, __aarch64_vdup_laneq_f16(__c, __lane)); } -FUNK float16x8_t vfmaq_laneq_f16(float16x8_t __a, float16x8_t __b, - float16x8_t __c, const int __lane) { +__funline float16x8_t vfmaq_laneq_f16(float16x8_t __a, float16x8_t __b, + float16x8_t __c, const int __lane) { return vfmaq_f16(__a, __b, __aarch64_vdupq_laneq_f16(__c, __lane)); } -FUNK float16x4_t vfma_n_f16(float16x4_t __a, float16x4_t __b, float16_t __c) { +__funline float16x4_t vfma_n_f16(float16x4_t __a, float16x4_t __b, + float16_t __c) { return vfma_f16(__a, __b, 
vdup_n_f16(__c)); } -FUNK float16x8_t vfmaq_n_f16(float16x8_t __a, float16x8_t __b, float16_t __c) { +__funline float16x8_t vfmaq_n_f16(float16x8_t __a, float16x8_t __b, + float16_t __c) { return vfmaq_f16(__a, __b, vdupq_n_f16(__c)); } -FUNK float16_t vfmsh_lane_f16(float16_t __a, float16_t __b, float16x4_t __c, - const int __lane) { +__funline float16_t vfmsh_lane_f16(float16_t __a, float16_t __b, + float16x4_t __c, const int __lane) { return vfmsh_f16(__a, __b, __aarch64_vget_lane_any(__c, __lane)); } -FUNK float16_t vfmsh_laneq_f16(float16_t __a, float16_t __b, float16x8_t __c, - const int __lane) { +__funline float16_t vfmsh_laneq_f16(float16_t __a, float16_t __b, + float16x8_t __c, const int __lane) { return vfmsh_f16(__a, __b, __aarch64_vget_lane_any(__c, __lane)); } -FUNK float16x4_t vfms_lane_f16(float16x4_t __a, float16x4_t __b, - float16x4_t __c, const int __lane) { +__funline float16x4_t vfms_lane_f16(float16x4_t __a, float16x4_t __b, + float16x4_t __c, const int __lane) { return vfms_f16(__a, __b, __aarch64_vdup_lane_f16(__c, __lane)); } -FUNK float16x8_t vfmsq_lane_f16(float16x8_t __a, float16x8_t __b, - float16x4_t __c, const int __lane) { +__funline float16x8_t vfmsq_lane_f16(float16x8_t __a, float16x8_t __b, + float16x4_t __c, const int __lane) { return vfmsq_f16(__a, __b, __aarch64_vdupq_lane_f16(__c, __lane)); } -FUNK float16x4_t vfms_laneq_f16(float16x4_t __a, float16x4_t __b, - float16x8_t __c, const int __lane) { +__funline float16x4_t vfms_laneq_f16(float16x4_t __a, float16x4_t __b, + float16x8_t __c, const int __lane) { return vfms_f16(__a, __b, __aarch64_vdup_laneq_f16(__c, __lane)); } -FUNK float16x8_t vfmsq_laneq_f16(float16x8_t __a, float16x8_t __b, - float16x8_t __c, const int __lane) { +__funline float16x8_t vfmsq_laneq_f16(float16x8_t __a, float16x8_t __b, + float16x8_t __c, const int __lane) { return vfmsq_f16(__a, __b, __aarch64_vdupq_laneq_f16(__c, __lane)); } -FUNK float16x4_t vfms_n_f16(float16x4_t __a, float16x4_t __b, float16_t __c) { +__funline float16x4_t vfms_n_f16(float16x4_t __a, float16x4_t __b, + float16_t __c) { return vfms_f16(__a, __b, vdup_n_f16(__c)); } -FUNK float16x8_t vfmsq_n_f16(float16x8_t __a, float16x8_t __b, float16_t __c) { +__funline float16x8_t vfmsq_n_f16(float16x8_t __a, float16x8_t __b, + float16_t __c) { return vfmsq_f16(__a, __b, vdupq_n_f16(__c)); } -FUNK float16_t vmulh_lane_f16(float16_t __a, float16x4_t __b, - const int __lane) { +__funline float16_t vmulh_lane_f16(float16_t __a, float16x4_t __b, + const int __lane) { return __a * __aarch64_vget_lane_any(__b, __lane); } -FUNK float16x4_t vmul_lane_f16(float16x4_t __a, float16x4_t __b, - const int __lane) { +__funline float16x4_t vmul_lane_f16(float16x4_t __a, float16x4_t __b, + const int __lane) { return vmul_f16(__a, vdup_n_f16(__aarch64_vget_lane_any(__b, __lane))); } -FUNK float16x8_t vmulq_lane_f16(float16x8_t __a, float16x4_t __b, - const int __lane) { +__funline float16x8_t vmulq_lane_f16(float16x8_t __a, float16x4_t __b, + const int __lane) { return vmulq_f16(__a, vdupq_n_f16(__aarch64_vget_lane_any(__b, __lane))); } -FUNK float16_t vmulh_laneq_f16(float16_t __a, float16x8_t __b, - const int __lane) { +__funline float16_t vmulh_laneq_f16(float16_t __a, float16x8_t __b, + const int __lane) { return __a * __aarch64_vget_lane_any(__b, __lane); } -FUNK float16x4_t vmul_laneq_f16(float16x4_t __a, float16x8_t __b, - const int __lane) { +__funline float16x4_t vmul_laneq_f16(float16x4_t __a, float16x8_t __b, + const int __lane) { return vmul_f16(__a, 
vdup_n_f16(__aarch64_vget_lane_any(__b, __lane))); } -FUNK float16x8_t vmulq_laneq_f16(float16x8_t __a, float16x8_t __b, - const int __lane) { +__funline float16x8_t vmulq_laneq_f16(float16x8_t __a, float16x8_t __b, + const int __lane) { return vmulq_f16(__a, vdupq_n_f16(__aarch64_vget_lane_any(__b, __lane))); } -FUNK float16x4_t vmul_n_f16(float16x4_t __a, float16_t __b) { +__funline float16x4_t vmul_n_f16(float16x4_t __a, float16_t __b) { return vmul_lane_f16(__a, vdup_n_f16(__b), 0); } -FUNK float16x8_t vmulq_n_f16(float16x8_t __a, float16_t __b) { +__funline float16x8_t vmulq_n_f16(float16x8_t __a, float16_t __b) { return vmulq_laneq_f16(__a, vdupq_n_f16(__b), 0); } -FUNK float16_t vmulxh_lane_f16(float16_t __a, float16x4_t __b, - const int __lane) { +__funline float16_t vmulxh_lane_f16(float16_t __a, float16x4_t __b, + const int __lane) { return vmulxh_f16(__a, __aarch64_vget_lane_any(__b, __lane)); } -FUNK float16x4_t vmulx_lane_f16(float16x4_t __a, float16x4_t __b, - const int __lane) { +__funline float16x4_t vmulx_lane_f16(float16x4_t __a, float16x4_t __b, + const int __lane) { return vmulx_f16(__a, __aarch64_vdup_lane_f16(__b, __lane)); } -FUNK float16x8_t vmulxq_lane_f16(float16x8_t __a, float16x4_t __b, - const int __lane) { +__funline float16x8_t vmulxq_lane_f16(float16x8_t __a, float16x4_t __b, + const int __lane) { return vmulxq_f16(__a, __aarch64_vdupq_lane_f16(__b, __lane)); } -FUNK float16_t vmulxh_laneq_f16(float16_t __a, float16x8_t __b, - const int __lane) { +__funline float16_t vmulxh_laneq_f16(float16_t __a, float16x8_t __b, + const int __lane) { return vmulxh_f16(__a, __aarch64_vget_lane_any(__b, __lane)); } -FUNK float16x4_t vmulx_laneq_f16(float16x4_t __a, float16x8_t __b, - const int __lane) { +__funline float16x4_t vmulx_laneq_f16(float16x4_t __a, float16x8_t __b, + const int __lane) { return vmulx_f16(__a, __aarch64_vdup_laneq_f16(__b, __lane)); } -FUNK float16x8_t vmulxq_laneq_f16(float16x8_t __a, float16x8_t __b, - const int __lane) { +__funline float16x8_t vmulxq_laneq_f16(float16x8_t __a, float16x8_t __b, + const int __lane) { return vmulxq_f16(__a, __aarch64_vdupq_laneq_f16(__b, __lane)); } -FUNK float16x4_t vmulx_n_f16(float16x4_t __a, float16_t __b) { +__funline float16x4_t vmulx_n_f16(float16x4_t __a, float16_t __b) { return vmulx_f16(__a, vdup_n_f16(__b)); } -FUNK float16x8_t vmulxq_n_f16(float16x8_t __a, float16_t __b) { +__funline float16x8_t vmulxq_n_f16(float16x8_t __a, float16_t __b) { return vmulxq_f16(__a, vdupq_n_f16(__b)); } -FUNK float16_t vmaxv_f16(float16x4_t __a) { +__funline float16_t vmaxv_f16(float16x4_t __a) { return __builtin_aarch64_reduc_smax_nan_scal_v4hf(__a); } -FUNK float16_t vmaxvq_f16(float16x8_t __a) { +__funline float16_t vmaxvq_f16(float16x8_t __a) { return __builtin_aarch64_reduc_smax_nan_scal_v8hf(__a); } -FUNK float16_t vminv_f16(float16x4_t __a) { +__funline float16_t vminv_f16(float16x4_t __a) { return __builtin_aarch64_reduc_smin_nan_scal_v4hf(__a); } -FUNK float16_t vminvq_f16(float16x8_t __a) { +__funline float16_t vminvq_f16(float16x8_t __a) { return __builtin_aarch64_reduc_smin_nan_scal_v8hf(__a); } -FUNK float16_t vmaxnmv_f16(float16x4_t __a) { +__funline float16_t vmaxnmv_f16(float16x4_t __a) { return __builtin_aarch64_reduc_smax_scal_v4hf(__a); } -FUNK float16_t vmaxnmvq_f16(float16x8_t __a) { +__funline float16_t vmaxnmvq_f16(float16x8_t __a) { return __builtin_aarch64_reduc_smax_scal_v8hf(__a); } -FUNK float16_t vminnmv_f16(float16x4_t __a) { +__funline float16_t vminnmv_f16(float16x4_t __a) { return 
__builtin_aarch64_reduc_smin_scal_v4hf(__a);
 }

-FUNK float16_t vminnmvq_f16(float16x8_t __a) {
+__funline float16_t vminnmvq_f16(float16x8_t __a) {
   return __builtin_aarch64_reduc_smin_scal_v8hf(__a);
 }

@@ -21441,59 +21570,59 @@ FUNK float16_t vminnmvq_f16(float16x8_t __a) {
 #pragma GCC push_options
 #pragma GCC target("arch=armv8.2-a+dotprod")

-FUNK uint32x2_t vdot_u32(uint32x2_t __r, uint8x8_t __a, uint8x8_t __b) {
+__funline uint32x2_t vdot_u32(uint32x2_t __r, uint8x8_t __a, uint8x8_t __b) {
   return __builtin_aarch64_udotv8qi_uuuu(__r, __a, __b);
 }

-FUNK uint32x4_t vdotq_u32(uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) {
+__funline uint32x4_t vdotq_u32(uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) {
   return __builtin_aarch64_udotv16qi_uuuu(__r, __a, __b);
 }

-FUNK int32x2_t vdot_s32(int32x2_t __r, int8x8_t __a, int8x8_t __b) {
+__funline int32x2_t vdot_s32(int32x2_t __r, int8x8_t __a, int8x8_t __b) {
   return __builtin_aarch64_sdotv8qi(__r, __a, __b);
 }

-FUNK int32x4_t vdotq_s32(int32x4_t __r, int8x16_t __a, int8x16_t __b) {
+__funline int32x4_t vdotq_s32(int32x4_t __r, int8x16_t __a, int8x16_t __b) {
   return __builtin_aarch64_sdotv16qi(__r, __a, __b);
 }

-FUNK uint32x2_t vdot_lane_u32(uint32x2_t __r, uint8x8_t __a, uint8x8_t __b,
-                              const int __index) {
+__funline uint32x2_t vdot_lane_u32(uint32x2_t __r, uint8x8_t __a, uint8x8_t __b,
+                                   const int __index) {
   return __builtin_aarch64_udot_lanev8qi_uuuus(__r, __a, __b, __index);
 }

-FUNK uint32x2_t vdot_laneq_u32(uint32x2_t __r, uint8x8_t __a, uint8x16_t __b,
-                               const int __index) {
+__funline uint32x2_t vdot_laneq_u32(uint32x2_t __r, uint8x8_t __a,
+                                    uint8x16_t __b, const int __index) {
   return __builtin_aarch64_udot_laneqv8qi_uuuus(__r, __a, __b, __index);
 }

-FUNK uint32x4_t vdotq_lane_u32(uint32x4_t __r, uint8x16_t __a, uint8x8_t __b,
-                               const int __index) {
+__funline uint32x4_t vdotq_lane_u32(uint32x4_t __r, uint8x16_t __a,
+                                    uint8x8_t __b, const int __index) {
   return __builtin_aarch64_udot_lanev16qi_uuuus(__r, __a, __b, __index);
 }

-FUNK uint32x4_t vdotq_laneq_u32(uint32x4_t __r, uint8x16_t __a, uint8x16_t __b,
-                                const int __index) {
+__funline uint32x4_t vdotq_laneq_u32(uint32x4_t __r, uint8x16_t __a,
+                                     uint8x16_t __b, const int __index) {
   return __builtin_aarch64_udot_laneqv16qi_uuuus(__r, __a, __b, __index);
 }

-FUNK int32x2_t vdot_lane_s32(int32x2_t __r, int8x8_t __a, int8x8_t __b,
-                             const int __index) {
+__funline int32x2_t vdot_lane_s32(int32x2_t __r, int8x8_t __a, int8x8_t __b,
+                                  const int __index) {
   return __builtin_aarch64_sdot_lanev8qi(__r, __a, __b, __index);
 }

-FUNK int32x2_t vdot_laneq_s32(int32x2_t __r, int8x8_t __a, int8x16_t __b,
-                              const int __index) {
+__funline int32x2_t vdot_laneq_s32(int32x2_t __r, int8x8_t __a, int8x16_t __b,
+                                   const int __index) {
   return __builtin_aarch64_sdot_laneqv8qi(__r, __a, __b, __index);
 }

-FUNK int32x4_t vdotq_lane_s32(int32x4_t __r, int8x16_t __a, int8x8_t __b,
-                              const int __index) {
+__funline int32x4_t vdotq_lane_s32(int32x4_t __r, int8x16_t __a, int8x8_t __b,
+                                   const int __index) {
   return __builtin_aarch64_sdot_lanev16qi(__r, __a, __b, __index);
 }

-FUNK int32x4_t vdotq_laneq_s32(int32x4_t __r, int8x16_t __a, int8x16_t __b,
-                               const int __index) {
+__funline int32x4_t vdotq_laneq_s32(int32x4_t __r, int8x16_t __a, int8x16_t __b,
+                                    const int __index) {
   return __builtin_aarch64_sdot_laneqv16qi(__r, __a, __b, __index);
 }

 #pragma GCC pop_options
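/*
 * Aside: what the +dotprod intrinsics above compute. vdot_u32 takes a
 * vector of 32-bit accumulators and two byte vectors, and for each
 * 32-bit lane adds in the dot product of the corresponding group of
 * four bytes; the _lane/_laneq forms reuse a single four-byte group of
 * __b for every lane. A minimal scalar sketch of the 64-bit form,
 * assuming <stdint.h> types (udot_reference is a hypothetical name,
 * not part of this header):
 */
static inline void udot_reference(uint32_t __r[2], const uint8_t __a[8],
                                  const uint8_t __b[8]) {
  for (int __i = 0; __i < 2; ++__i)    /* one 32-bit accumulator lane */
    for (int __j = 0; __j < 4; ++__j)  /* four bytes feed each lane   */
      __r[__i] += (uint32_t)__a[4 * __i + __j] * __b[4 * __i + __j];
}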
target("arch=armv8.2-a+sm4") -FUNK uint32x4_t vsm3ss1q_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { +__funline uint32x4_t vsm3ss1q_u32(uint32x4_t __a, uint32x4_t __b, + uint32x4_t __c) { return __builtin_aarch64_sm3ss1qv4si_uuuu(__a, __b, __c); } -FUNK uint32x4_t vsm3tt1aq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c, - const int __imm2) { +__funline uint32x4_t vsm3tt1aq_u32(uint32x4_t __a, uint32x4_t __b, + uint32x4_t __c, const int __imm2) { return __builtin_aarch64_sm3tt1aqv4si_uuuus(__a, __b, __c, __imm2); } -FUNK uint32x4_t vsm3tt1bq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c, - const int __imm2) { +__funline uint32x4_t vsm3tt1bq_u32(uint32x4_t __a, uint32x4_t __b, + uint32x4_t __c, const int __imm2) { return __builtin_aarch64_sm3tt1bqv4si_uuuus(__a, __b, __c, __imm2); } -FUNK uint32x4_t vsm3tt2aq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c, - const int __imm2) { +__funline uint32x4_t vsm3tt2aq_u32(uint32x4_t __a, uint32x4_t __b, + uint32x4_t __c, const int __imm2) { return __builtin_aarch64_sm3tt2aqv4si_uuuus(__a, __b, __c, __imm2); } -FUNK uint32x4_t vsm3tt2bq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c, - const int __imm2) { +__funline uint32x4_t vsm3tt2bq_u32(uint32x4_t __a, uint32x4_t __b, + uint32x4_t __c, const int __imm2) { return __builtin_aarch64_sm3tt2bqv4si_uuuus(__a, __b, __c, __imm2); } -FUNK uint32x4_t vsm3partw1q_u32(uint32x4_t __a, uint32x4_t __b, - uint32x4_t __c) { +__funline uint32x4_t vsm3partw1q_u32(uint32x4_t __a, uint32x4_t __b, + uint32x4_t __c) { return __builtin_aarch64_sm3partw1qv4si_uuuu(__a, __b, __c); } -FUNK uint32x4_t vsm3partw2q_u32(uint32x4_t __a, uint32x4_t __b, - uint32x4_t __c) { +__funline uint32x4_t vsm3partw2q_u32(uint32x4_t __a, uint32x4_t __b, + uint32x4_t __c) { return __builtin_aarch64_sm3partw2qv4si_uuuu(__a, __b, __c); } -FUNK uint32x4_t vsm4eq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vsm4eq_u32(uint32x4_t __a, uint32x4_t __b) { return __builtin_aarch64_sm4eqv4si_uuu(__a, __b); } -FUNK uint32x4_t vsm4ekeyq_u32(uint32x4_t __a, uint32x4_t __b) { +__funline uint32x4_t vsm4ekeyq_u32(uint32x4_t __a, uint32x4_t __b) { return __builtin_aarch64_sm4ekeyqv4si_uuu(__a, __b); } @@ -21547,92 +21677,100 @@ FUNK uint32x4_t vsm4ekeyq_u32(uint32x4_t __a, uint32x4_t __b) { #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+sha3") -FUNK uint64x2_t vsha512hq_u64(uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) { +__funline uint64x2_t vsha512hq_u64(uint64x2_t __a, uint64x2_t __b, + uint64x2_t __c) { return __builtin_aarch64_crypto_sha512hqv2di_uuuu(__a, __b, __c); } -FUNK uint64x2_t vsha512h2q_u64(uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) { +__funline uint64x2_t vsha512h2q_u64(uint64x2_t __a, uint64x2_t __b, + uint64x2_t __c) { return __builtin_aarch64_crypto_sha512h2qv2di_uuuu(__a, __b, __c); } -FUNK uint64x2_t vsha512su0q_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vsha512su0q_u64(uint64x2_t __a, uint64x2_t __b) { return __builtin_aarch64_crypto_sha512su0qv2di_uuu(__a, __b); } -FUNK uint64x2_t vsha512su1q_u64(uint64x2_t __a, uint64x2_t __b, - uint64x2_t __c) { +__funline uint64x2_t vsha512su1q_u64(uint64x2_t __a, uint64x2_t __b, + uint64x2_t __c) { return __builtin_aarch64_crypto_sha512su1qv2di_uuuu(__a, __b, __c); } -FUNK uint8x16_t veor3q_u8(uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { +__funline uint8x16_t veor3q_u8(uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { return __builtin_aarch64_eor3qv16qi_uuuu(__a, __b, __c); } -FUNK uint16x8_t veor3q_u16(uint16x8_t 
__a, uint16x8_t __b, uint16x8_t __c) { +__funline uint16x8_t veor3q_u16(uint16x8_t __a, uint16x8_t __b, + uint16x8_t __c) { return __builtin_aarch64_eor3qv8hi_uuuu(__a, __b, __c); } -FUNK uint32x4_t veor3q_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { +__funline uint32x4_t veor3q_u32(uint32x4_t __a, uint32x4_t __b, + uint32x4_t __c) { return __builtin_aarch64_eor3qv4si_uuuu(__a, __b, __c); } -FUNK uint64x2_t veor3q_u64(uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) { +__funline uint64x2_t veor3q_u64(uint64x2_t __a, uint64x2_t __b, + uint64x2_t __c) { return __builtin_aarch64_eor3qv2di_uuuu(__a, __b, __c); } -FUNK int8x16_t veor3q_s8(int8x16_t __a, int8x16_t __b, int8x16_t __c) { +__funline int8x16_t veor3q_s8(int8x16_t __a, int8x16_t __b, int8x16_t __c) { return __builtin_aarch64_eor3qv16qi(__a, __b, __c); } -FUNK int16x8_t veor3q_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c) { +__funline int16x8_t veor3q_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c) { return __builtin_aarch64_eor3qv8hi(__a, __b, __c); } -FUNK int32x4_t veor3q_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c) { +__funline int32x4_t veor3q_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c) { return __builtin_aarch64_eor3qv4si(__a, __b, __c); } -FUNK int64x2_t veor3q_s64(int64x2_t __a, int64x2_t __b, int64x2_t __c) { +__funline int64x2_t veor3q_s64(int64x2_t __a, int64x2_t __b, int64x2_t __c) { return __builtin_aarch64_eor3qv2di(__a, __b, __c); } -FUNK uint64x2_t vrax1q_u64(uint64x2_t __a, uint64x2_t __b) { +__funline uint64x2_t vrax1q_u64(uint64x2_t __a, uint64x2_t __b) { return __builtin_aarch64_rax1qv2di_uuu(__a, __b); } -FUNK uint64x2_t vxarq_u64(uint64x2_t __a, uint64x2_t __b, const int imm6) { +__funline uint64x2_t vxarq_u64(uint64x2_t __a, uint64x2_t __b, const int imm6) { return __builtin_aarch64_xarqv2di_uuus(__a, __b, imm6); } -FUNK uint8x16_t vbcaxq_u8(uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { +__funline uint8x16_t vbcaxq_u8(uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { return __builtin_aarch64_bcaxqv16qi_uuuu(__a, __b, __c); } -FUNK uint16x8_t vbcaxq_u16(uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) { +__funline uint16x8_t vbcaxq_u16(uint16x8_t __a, uint16x8_t __b, + uint16x8_t __c) { return __builtin_aarch64_bcaxqv8hi_uuuu(__a, __b, __c); } -FUNK uint32x4_t vbcaxq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { +__funline uint32x4_t vbcaxq_u32(uint32x4_t __a, uint32x4_t __b, + uint32x4_t __c) { return __builtin_aarch64_bcaxqv4si_uuuu(__a, __b, __c); } -FUNK uint64x2_t vbcaxq_u64(uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) { +__funline uint64x2_t vbcaxq_u64(uint64x2_t __a, uint64x2_t __b, + uint64x2_t __c) { return __builtin_aarch64_bcaxqv2di_uuuu(__a, __b, __c); } -FUNK int8x16_t vbcaxq_s8(int8x16_t __a, int8x16_t __b, int8x16_t __c) { +__funline int8x16_t vbcaxq_s8(int8x16_t __a, int8x16_t __b, int8x16_t __c) { return __builtin_aarch64_bcaxqv16qi(__a, __b, __c); } -FUNK int16x8_t vbcaxq_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c) { +__funline int16x8_t vbcaxq_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c) { return __builtin_aarch64_bcaxqv8hi(__a, __b, __c); } -FUNK int32x4_t vbcaxq_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c) { +__funline int32x4_t vbcaxq_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c) { return __builtin_aarch64_bcaxqv4si(__a, __b, __c); } -FUNK int64x2_t vbcaxq_s64(int64x2_t __a, int64x2_t __b, int64x2_t __c) { +__funline int64x2_t vbcaxq_s64(int64x2_t __a, int64x2_t __b, int64x2_t __c) { return __builtin_aarch64_bcaxqv2di(__a, __b, __c); } @@ 
-21643,299 +21781,326 @@ FUNK int64x2_t vbcaxq_s64(int64x2_t __a, int64x2_t __b, int64x2_t __c) { #pragma GCC push_options #pragma GCC target("+fp16") -FUNK float16x4_t vcadd_rot90_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vcadd_rot90_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_fcadd90v4hf(__a, __b); } -FUNK float16x8_t vcaddq_rot90_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vcaddq_rot90_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_fcadd90v8hf(__a, __b); } -FUNK float16x4_t vcadd_rot270_f16(float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vcadd_rot270_f16(float16x4_t __a, float16x4_t __b) { return __builtin_aarch64_fcadd270v4hf(__a, __b); } -FUNK float16x8_t vcaddq_rot270_f16(float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vcaddq_rot270_f16(float16x8_t __a, float16x8_t __b) { return __builtin_aarch64_fcadd270v8hf(__a, __b); } -FUNK float16x4_t vcmla_f16(float16x4_t __r, float16x4_t __a, float16x4_t __b) { +__funline float16x4_t vcmla_f16(float16x4_t __r, float16x4_t __a, + float16x4_t __b) { return __builtin_aarch64_fcmla0v4hf(__r, __a, __b); } -FUNK float16x8_t vcmlaq_f16(float16x8_t __r, float16x8_t __a, float16x8_t __b) { +__funline float16x8_t vcmlaq_f16(float16x8_t __r, float16x8_t __a, + float16x8_t __b) { return __builtin_aarch64_fcmla0v8hf(__r, __a, __b); } -FUNK float16x4_t vcmla_lane_f16(float16x4_t __r, float16x4_t __a, - float16x4_t __b, const int __index) { +__funline float16x4_t vcmla_lane_f16(float16x4_t __r, float16x4_t __a, + float16x4_t __b, const int __index) { return __builtin_aarch64_fcmla_lane0v4hf(__r, __a, __b, __index); } -FUNK float16x4_t vcmla_laneq_f16(float16x4_t __r, float16x4_t __a, - float16x8_t __b, const int __index) { +__funline float16x4_t vcmla_laneq_f16(float16x4_t __r, float16x4_t __a, + float16x8_t __b, const int __index) { return __builtin_aarch64_fcmla_laneq0v4hf(__r, __a, __b, __index); } -FUNK float16x8_t vcmlaq_lane_f16(float16x8_t __r, float16x8_t __a, - float16x4_t __b, const int __index) { +__funline float16x8_t vcmlaq_lane_f16(float16x8_t __r, float16x8_t __a, + float16x4_t __b, const int __index) { return __builtin_aarch64_fcmlaq_lane0v8hf(__r, __a, __b, __index); } -FUNK float16x8_t vcmlaq_rot90_lane_f16(float16x8_t __r, float16x8_t __a, - float16x4_t __b, const int __index) { +__funline float16x8_t vcmlaq_rot90_lane_f16(float16x8_t __r, float16x8_t __a, + float16x4_t __b, + const int __index) { return __builtin_aarch64_fcmlaq_lane90v8hf(__r, __a, __b, __index); } -FUNK float16x4_t vcmla_rot90_laneq_f16(float16x4_t __r, float16x4_t __a, - float16x8_t __b, const int __index) { +__funline float16x4_t vcmla_rot90_laneq_f16(float16x4_t __r, float16x4_t __a, + float16x8_t __b, + const int __index) { return __builtin_aarch64_fcmla_laneq90v4hf(__r, __a, __b, __index); } -FUNK float16x4_t vcmla_rot90_lane_f16(float16x4_t __r, float16x4_t __a, - float16x4_t __b, const int __index) { +__funline float16x4_t vcmla_rot90_lane_f16(float16x4_t __r, float16x4_t __a, + float16x4_t __b, const int __index) { return __builtin_aarch64_fcmla_lane90v4hf(__r, __a, __b, __index); } -FUNK float16x8_t vcmlaq_rot90_f16(float16x8_t __r, float16x8_t __a, - float16x8_t __b) { +__funline float16x8_t vcmlaq_rot90_f16(float16x8_t __r, float16x8_t __a, + float16x8_t __b) { return __builtin_aarch64_fcmla90v8hf(__r, __a, __b); } -FUNK float16x4_t vcmla_rot90_f16(float16x4_t __r, float16x4_t __a, - float16x4_t __b) { +__funline float16x4_t vcmla_rot90_f16(float16x4_t __r, 
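/* The vcmla intrinsics map to FCMLA, which treats even/odd lane pairs
   as the real/imaginary parts of complex numbers. Roughly: issuing the
   rot0 form followed by the rot90 form accumulates a full complex
   product into __r, while rot180 followed by rot270 accumulates its
   negation. */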
float16x4_t __a, + float16x4_t __b) { return __builtin_aarch64_fcmla90v4hf(__r, __a, __b); } -FUNK float16x8_t vcmlaq_laneq_f16(float16x8_t __r, float16x8_t __a, - float16x8_t __b, const int __index) { +__funline float16x8_t vcmlaq_laneq_f16(float16x8_t __r, float16x8_t __a, + float16x8_t __b, const int __index) { return __builtin_aarch64_fcmla_lane0v8hf(__r, __a, __b, __index); } -FUNK float16x4_t vcmla_rot180_laneq_f16(float16x4_t __r, float16x4_t __a, - float16x8_t __b, const int __index) { +__funline float16x4_t vcmla_rot180_laneq_f16(float16x4_t __r, float16x4_t __a, + float16x8_t __b, + const int __index) { return __builtin_aarch64_fcmla_laneq180v4hf(__r, __a, __b, __index); } -FUNK float16x4_t vcmla_rot180_lane_f16(float16x4_t __r, float16x4_t __a, - float16x4_t __b, const int __index) { +__funline float16x4_t vcmla_rot180_lane_f16(float16x4_t __r, float16x4_t __a, + float16x4_t __b, + const int __index) { return __builtin_aarch64_fcmla_lane180v4hf(__r, __a, __b, __index); } -FUNK float16x8_t vcmlaq_rot180_f16(float16x8_t __r, float16x8_t __a, - float16x8_t __b) { +__funline float16x8_t vcmlaq_rot180_f16(float16x8_t __r, float16x8_t __a, + float16x8_t __b) { return __builtin_aarch64_fcmla180v8hf(__r, __a, __b); } -FUNK float16x4_t vcmla_rot180_f16(float16x4_t __r, float16x4_t __a, - float16x4_t __b) { +__funline float16x4_t vcmla_rot180_f16(float16x4_t __r, float16x4_t __a, + float16x4_t __b) { return __builtin_aarch64_fcmla180v4hf(__r, __a, __b); } -FUNK float16x8_t vcmlaq_rot90_laneq_f16(float16x8_t __r, float16x8_t __a, - float16x8_t __b, const int __index) { +__funline float16x8_t vcmlaq_rot90_laneq_f16(float16x8_t __r, float16x8_t __a, + float16x8_t __b, + const int __index) { return __builtin_aarch64_fcmla_lane90v8hf(__r, __a, __b, __index); } -FUNK float16x8_t vcmlaq_rot270_laneq_f16(float16x8_t __r, float16x8_t __a, - float16x8_t __b, const int __index) { +__funline float16x8_t vcmlaq_rot270_laneq_f16(float16x8_t __r, float16x8_t __a, + float16x8_t __b, + const int __index) { return __builtin_aarch64_fcmla_lane270v8hf(__r, __a, __b, __index); } -FUNK float16x8_t vcmlaq_rot270_lane_f16(float16x8_t __r, float16x8_t __a, - float16x4_t __b, const int __index) { +__funline float16x8_t vcmlaq_rot270_lane_f16(float16x8_t __r, float16x8_t __a, + float16x4_t __b, + const int __index) { return __builtin_aarch64_fcmlaq_lane270v8hf(__r, __a, __b, __index); } -FUNK float16x4_t vcmla_rot270_laneq_f16(float16x4_t __r, float16x4_t __a, - float16x8_t __b, const int __index) { +__funline float16x4_t vcmla_rot270_laneq_f16(float16x4_t __r, float16x4_t __a, + float16x8_t __b, + const int __index) { return __builtin_aarch64_fcmla_laneq270v4hf(__r, __a, __b, __index); } -FUNK float16x8_t vcmlaq_rot270_f16(float16x8_t __r, float16x8_t __a, - float16x8_t __b) { +__funline float16x8_t vcmlaq_rot270_f16(float16x8_t __r, float16x8_t __a, + float16x8_t __b) { return __builtin_aarch64_fcmla270v8hf(__r, __a, __b); } -FUNK float16x4_t vcmla_rot270_f16(float16x4_t __r, float16x4_t __a, - float16x4_t __b) { +__funline float16x4_t vcmla_rot270_f16(float16x4_t __r, float16x4_t __a, + float16x4_t __b) { return __builtin_aarch64_fcmla270v4hf(__r, __a, __b); } -FUNK float16x8_t vcmlaq_rot180_laneq_f16(float16x8_t __r, float16x8_t __a, - float16x8_t __b, const int __index) { +__funline float16x8_t vcmlaq_rot180_laneq_f16(float16x8_t __r, float16x8_t __a, + float16x8_t __b, + const int __index) { return __builtin_aarch64_fcmla_lane180v8hf(__r, __a, __b, __index); } -FUNK float16x8_t 
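/* Naming convention used throughout: _lane_ variants index into a
   64-bit vector (e.g. float16x4_t), while _laneq_ variants index into
   a 128-bit vector (e.g. float16x8_t). */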
vcmlaq_rot180_lane_f16(float16x8_t __r, float16x8_t __a, - float16x4_t __b, const int __index) { +__funline float16x8_t vcmlaq_rot180_lane_f16(float16x8_t __r, float16x8_t __a, + float16x4_t __b, + const int __index) { return __builtin_aarch64_fcmlaq_lane180v8hf(__r, __a, __b, __index); } -FUNK float16x4_t vcmla_rot270_lane_f16(float16x4_t __r, float16x4_t __a, - float16x4_t __b, const int __index) { +__funline float16x4_t vcmla_rot270_lane_f16(float16x4_t __r, float16x4_t __a, + float16x4_t __b, + const int __index) { return __builtin_aarch64_fcmla_lane270v4hf(__r, __a, __b, __index); } #pragma GCC pop_options -FUNK float32x2_t vcadd_rot90_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vcadd_rot90_f32(float32x2_t __a, float32x2_t __b) { return __builtin_aarch64_fcadd90v2sf(__a, __b); } -FUNK float32x4_t vcaddq_rot90_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vcaddq_rot90_f32(float32x4_t __a, float32x4_t __b) { return __builtin_aarch64_fcadd90v4sf(__a, __b); } -FUNK float64x2_t vcaddq_rot90_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vcaddq_rot90_f64(float64x2_t __a, float64x2_t __b) { return __builtin_aarch64_fcadd90v2df(__a, __b); } -FUNK float32x2_t vcadd_rot270_f32(float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vcadd_rot270_f32(float32x2_t __a, float32x2_t __b) { return __builtin_aarch64_fcadd270v2sf(__a, __b); } -FUNK float32x4_t vcaddq_rot270_f32(float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vcaddq_rot270_f32(float32x4_t __a, float32x4_t __b) { return __builtin_aarch64_fcadd270v4sf(__a, __b); } -FUNK float64x2_t vcaddq_rot270_f64(float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vcaddq_rot270_f64(float64x2_t __a, float64x2_t __b) { return __builtin_aarch64_fcadd270v2df(__a, __b); } -FUNK float32x2_t vcmla_f32(float32x2_t __r, float32x2_t __a, float32x2_t __b) { +__funline float32x2_t vcmla_f32(float32x2_t __r, float32x2_t __a, + float32x2_t __b) { return __builtin_aarch64_fcmla0v2sf(__r, __a, __b); } -FUNK float32x4_t vcmlaq_f32(float32x4_t __r, float32x4_t __a, float32x4_t __b) { +__funline float32x4_t vcmlaq_f32(float32x4_t __r, float32x4_t __a, + float32x4_t __b) { return __builtin_aarch64_fcmla0v4sf(__r, __a, __b); } -FUNK float64x2_t vcmlaq_f64(float64x2_t __r, float64x2_t __a, float64x2_t __b) { +__funline float64x2_t vcmlaq_f64(float64x2_t __r, float64x2_t __a, + float64x2_t __b) { return __builtin_aarch64_fcmla0v2df(__r, __a, __b); } -FUNK float32x2_t vcmla_lane_f32(float32x2_t __r, float32x2_t __a, - float32x2_t __b, const int __index) { +__funline float32x2_t vcmla_lane_f32(float32x2_t __r, float32x2_t __a, + float32x2_t __b, const int __index) { return __builtin_aarch64_fcmla_lane0v2sf(__r, __a, __b, __index); } -FUNK float32x2_t vcmla_laneq_f32(float32x2_t __r, float32x2_t __a, - float32x4_t __b, const int __index) { +__funline float32x2_t vcmla_laneq_f32(float32x2_t __r, float32x2_t __a, + float32x4_t __b, const int __index) { return __builtin_aarch64_fcmla_laneq0v2sf(__r, __a, __b, __index); } -FUNK float32x4_t vcmlaq_lane_f32(float32x4_t __r, float32x4_t __a, - float32x2_t __b, const int __index) { +__funline float32x4_t vcmlaq_lane_f32(float32x4_t __r, float32x4_t __a, + float32x2_t __b, const int __index) { return __builtin_aarch64_fcmlaq_lane0v4sf(__r, __a, __b, __index); } -FUNK float32x4_t vcmlaq_laneq_f32(float32x4_t __r, float32x4_t __a, - float32x4_t __b, const int __index) { +__funline float32x4_t vcmlaq_laneq_f32(float32x4_t __r, float32x4_t __a, + float32x4_t __b, const 
int __index) { return __builtin_aarch64_fcmla_lane0v4sf(__r, __a, __b, __index); } -FUNK float32x2_t vcmla_rot90_f32(float32x2_t __r, float32x2_t __a, - float32x2_t __b) { +__funline float32x2_t vcmla_rot90_f32(float32x2_t __r, float32x2_t __a, + float32x2_t __b) { return __builtin_aarch64_fcmla90v2sf(__r, __a, __b); } -FUNK float32x4_t vcmlaq_rot90_f32(float32x4_t __r, float32x4_t __a, - float32x4_t __b) { +__funline float32x4_t vcmlaq_rot90_f32(float32x4_t __r, float32x4_t __a, + float32x4_t __b) { return __builtin_aarch64_fcmla90v4sf(__r, __a, __b); } -FUNK float64x2_t vcmlaq_rot90_f64(float64x2_t __r, float64x2_t __a, - float64x2_t __b) { +__funline float64x2_t vcmlaq_rot90_f64(float64x2_t __r, float64x2_t __a, + float64x2_t __b) { return __builtin_aarch64_fcmla90v2df(__r, __a, __b); } -FUNK float32x2_t vcmla_rot90_lane_f32(float32x2_t __r, float32x2_t __a, - float32x2_t __b, const int __index) { +__funline float32x2_t vcmla_rot90_lane_f32(float32x2_t __r, float32x2_t __a, + float32x2_t __b, const int __index) { return __builtin_aarch64_fcmla_lane90v2sf(__r, __a, __b, __index); } -FUNK float32x2_t vcmla_rot90_laneq_f32(float32x2_t __r, float32x2_t __a, - float32x4_t __b, const int __index) { +__funline float32x2_t vcmla_rot90_laneq_f32(float32x2_t __r, float32x2_t __a, + float32x4_t __b, + const int __index) { return __builtin_aarch64_fcmla_laneq90v2sf(__r, __a, __b, __index); } -FUNK float32x4_t vcmlaq_rot90_lane_f32(float32x4_t __r, float32x4_t __a, - float32x2_t __b, const int __index) { +__funline float32x4_t vcmlaq_rot90_lane_f32(float32x4_t __r, float32x4_t __a, + float32x2_t __b, + const int __index) { return __builtin_aarch64_fcmlaq_lane90v4sf(__r, __a, __b, __index); } -FUNK float32x4_t vcmlaq_rot90_laneq_f32(float32x4_t __r, float32x4_t __a, - float32x4_t __b, const int __index) { +__funline float32x4_t vcmlaq_rot90_laneq_f32(float32x4_t __r, float32x4_t __a, + float32x4_t __b, + const int __index) { return __builtin_aarch64_fcmla_lane90v4sf(__r, __a, __b, __index); } -FUNK float32x2_t vcmla_rot180_f32(float32x2_t __r, float32x2_t __a, - float32x2_t __b) { +__funline float32x2_t vcmla_rot180_f32(float32x2_t __r, float32x2_t __a, + float32x2_t __b) { return __builtin_aarch64_fcmla180v2sf(__r, __a, __b); } -FUNK float32x4_t vcmlaq_rot180_f32(float32x4_t __r, float32x4_t __a, - float32x4_t __b) { +__funline float32x4_t vcmlaq_rot180_f32(float32x4_t __r, float32x4_t __a, + float32x4_t __b) { return __builtin_aarch64_fcmla180v4sf(__r, __a, __b); } -FUNK float64x2_t vcmlaq_rot180_f64(float64x2_t __r, float64x2_t __a, - float64x2_t __b) { +__funline float64x2_t vcmlaq_rot180_f64(float64x2_t __r, float64x2_t __a, + float64x2_t __b) { return __builtin_aarch64_fcmla180v2df(__r, __a, __b); } -FUNK float32x2_t vcmla_rot180_lane_f32(float32x2_t __r, float32x2_t __a, - float32x2_t __b, const int __index) { +__funline float32x2_t vcmla_rot180_lane_f32(float32x2_t __r, float32x2_t __a, + float32x2_t __b, + const int __index) { return __builtin_aarch64_fcmla_lane180v2sf(__r, __a, __b, __index); } -FUNK float32x2_t vcmla_rot180_laneq_f32(float32x2_t __r, float32x2_t __a, - float32x4_t __b, const int __index) { +__funline float32x2_t vcmla_rot180_laneq_f32(float32x2_t __r, float32x2_t __a, + float32x4_t __b, + const int __index) { return __builtin_aarch64_fcmla_laneq180v2sf(__r, __a, __b, __index); } -FUNK float32x4_t vcmlaq_rot180_lane_f32(float32x4_t __r, float32x4_t __a, - float32x2_t __b, const int __index) { +__funline float32x4_t vcmlaq_rot180_lane_f32(float32x4_t __r, float32x4_t __a, + 
float32x2_t __b, + const int __index) { return __builtin_aarch64_fcmlaq_lane180v4sf(__r, __a, __b, __index); } -FUNK float32x4_t vcmlaq_rot180_laneq_f32(float32x4_t __r, float32x4_t __a, - float32x4_t __b, const int __index) { +__funline float32x4_t vcmlaq_rot180_laneq_f32(float32x4_t __r, float32x4_t __a, + float32x4_t __b, + const int __index) { return __builtin_aarch64_fcmla_lane180v4sf(__r, __a, __b, __index); } -FUNK float32x2_t vcmla_rot270_f32(float32x2_t __r, float32x2_t __a, - float32x2_t __b) { +__funline float32x2_t vcmla_rot270_f32(float32x2_t __r, float32x2_t __a, + float32x2_t __b) { return __builtin_aarch64_fcmla270v2sf(__r, __a, __b); } -FUNK float32x4_t vcmlaq_rot270_f32(float32x4_t __r, float32x4_t __a, - float32x4_t __b) { +__funline float32x4_t vcmlaq_rot270_f32(float32x4_t __r, float32x4_t __a, + float32x4_t __b) { return __builtin_aarch64_fcmla270v4sf(__r, __a, __b); } -FUNK float64x2_t vcmlaq_rot270_f64(float64x2_t __r, float64x2_t __a, - float64x2_t __b) { +__funline float64x2_t vcmlaq_rot270_f64(float64x2_t __r, float64x2_t __a, + float64x2_t __b) { return __builtin_aarch64_fcmla270v2df(__r, __a, __b); } -FUNK float32x2_t vcmla_rot270_lane_f32(float32x2_t __r, float32x2_t __a, - float32x2_t __b, const int __index) { +__funline float32x2_t vcmla_rot270_lane_f32(float32x2_t __r, float32x2_t __a, + float32x2_t __b, + const int __index) { return __builtin_aarch64_fcmla_lane270v2sf(__r, __a, __b, __index); } -FUNK float32x2_t vcmla_rot270_laneq_f32(float32x2_t __r, float32x2_t __a, - float32x4_t __b, const int __index) { +__funline float32x2_t vcmla_rot270_laneq_f32(float32x2_t __r, float32x2_t __a, + float32x4_t __b, + const int __index) { return __builtin_aarch64_fcmla_laneq270v2sf(__r, __a, __b, __index); } -FUNK float32x4_t vcmlaq_rot270_lane_f32(float32x4_t __r, float32x4_t __a, - float32x2_t __b, const int __index) { +__funline float32x4_t vcmlaq_rot270_lane_f32(float32x4_t __r, float32x4_t __a, + float32x2_t __b, + const int __index) { return __builtin_aarch64_fcmlaq_lane270v4sf(__r, __a, __b, __index); } -FUNK float32x4_t vcmlaq_rot270_laneq_f32(float32x4_t __r, float32x4_t __a, - float32x4_t __b, const int __index) { +__funline float32x4_t vcmlaq_rot270_laneq_f32(float32x4_t __r, float32x4_t __a, + float32x4_t __b, + const int __index) { return __builtin_aarch64_fcmla_lane270v4sf(__r, __a, __b, __index); } @@ -21944,123 +22109,125 @@ FUNK float32x4_t vcmlaq_rot270_laneq_f32(float32x4_t __r, float32x4_t __a, #pragma GCC push_options #pragma GCC target("arch=armv8.2-a+fp16fml") -FUNK float32x2_t vfmlal_low_f16(float32x2_t __r, float16x4_t __a, - float16x4_t __b) { +__funline float32x2_t vfmlal_low_f16(float32x2_t __r, float16x4_t __a, + float16x4_t __b) { return __builtin_aarch64_fmlal_lowv2sf(__r, __a, __b); } -FUNK float32x2_t vfmlsl_low_f16(float32x2_t __r, float16x4_t __a, - float16x4_t __b) { +__funline float32x2_t vfmlsl_low_f16(float32x2_t __r, float16x4_t __a, + float16x4_t __b) { return __builtin_aarch64_fmlsl_lowv2sf(__r, __a, __b); } -FUNK float32x4_t vfmlalq_low_f16(float32x4_t __r, float16x8_t __a, - float16x8_t __b) { +__funline float32x4_t vfmlalq_low_f16(float32x4_t __r, float16x8_t __a, + float16x8_t __b) { return __builtin_aarch64_fmlalq_lowv4sf(__r, __a, __b); } -FUNK float32x4_t vfmlslq_low_f16(float32x4_t __r, float16x8_t __a, - float16x8_t __b) { +__funline float32x4_t vfmlslq_low_f16(float32x4_t __r, float16x8_t __a, + float16x8_t __b) { return __builtin_aarch64_fmlslq_lowv4sf(__r, __a, __b); } -FUNK float32x2_t vfmlal_high_f16(float32x2_t 
__r, float16x4_t __a, - float16x4_t __b) { +__funline float32x2_t vfmlal_high_f16(float32x2_t __r, float16x4_t __a, + float16x4_t __b) { return __builtin_aarch64_fmlal_highv2sf(__r, __a, __b); } -FUNK float32x2_t vfmlsl_high_f16(float32x2_t __r, float16x4_t __a, - float16x4_t __b) { +__funline float32x2_t vfmlsl_high_f16(float32x2_t __r, float16x4_t __a, + float16x4_t __b) { return __builtin_aarch64_fmlsl_highv2sf(__r, __a, __b); } -FUNK float32x4_t vfmlalq_high_f16(float32x4_t __r, float16x8_t __a, - float16x8_t __b) { +__funline float32x4_t vfmlalq_high_f16(float32x4_t __r, float16x8_t __a, + float16x8_t __b) { return __builtin_aarch64_fmlalq_highv4sf(__r, __a, __b); } -FUNK float32x4_t vfmlslq_high_f16(float32x4_t __r, float16x8_t __a, - float16x8_t __b) { +__funline float32x4_t vfmlslq_high_f16(float32x4_t __r, float16x8_t __a, + float16x8_t __b) { return __builtin_aarch64_fmlslq_highv4sf(__r, __a, __b); } -FUNK float32x2_t vfmlal_lane_low_f16(float32x2_t __r, float16x4_t __a, - float16x4_t __b, const int __lane) { +__funline float32x2_t vfmlal_lane_low_f16(float32x2_t __r, float16x4_t __a, + float16x4_t __b, const int __lane) { return __builtin_aarch64_fmlal_lane_lowv2sf(__r, __a, __b, __lane); } -FUNK float32x2_t vfmlsl_lane_low_f16(float32x2_t __r, float16x4_t __a, - float16x4_t __b, const int __lane) { +__funline float32x2_t vfmlsl_lane_low_f16(float32x2_t __r, float16x4_t __a, + float16x4_t __b, const int __lane) { return __builtin_aarch64_fmlsl_lane_lowv2sf(__r, __a, __b, __lane); } -FUNK float32x2_t vfmlal_laneq_low_f16(float32x2_t __r, float16x4_t __a, - float16x8_t __b, const int __lane) { +__funline float32x2_t vfmlal_laneq_low_f16(float32x2_t __r, float16x4_t __a, + float16x8_t __b, const int __lane) { return __builtin_aarch64_fmlal_laneq_lowv2sf(__r, __a, __b, __lane); } -FUNK float32x2_t vfmlsl_laneq_low_f16(float32x2_t __r, float16x4_t __a, - float16x8_t __b, const int __lane) { +__funline float32x2_t vfmlsl_laneq_low_f16(float32x2_t __r, float16x4_t __a, + float16x8_t __b, const int __lane) { return __builtin_aarch64_fmlsl_laneq_lowv2sf(__r, __a, __b, __lane); } -FUNK float32x4_t vfmlalq_lane_low_f16(float32x4_t __r, float16x8_t __a, - float16x4_t __b, const int __lane) { +__funline float32x4_t vfmlalq_lane_low_f16(float32x4_t __r, float16x8_t __a, + float16x4_t __b, const int __lane) { return __builtin_aarch64_fmlalq_lane_lowv4sf(__r, __a, __b, __lane); } -FUNK float32x4_t vfmlslq_lane_low_f16(float32x4_t __r, float16x8_t __a, - float16x4_t __b, const int __lane) { +__funline float32x4_t vfmlslq_lane_low_f16(float32x4_t __r, float16x8_t __a, + float16x4_t __b, const int __lane) { return __builtin_aarch64_fmlslq_lane_lowv4sf(__r, __a, __b, __lane); } -FUNK float32x4_t vfmlalq_laneq_low_f16(float32x4_t __r, float16x8_t __a, - float16x8_t __b, const int __lane) { +__funline float32x4_t vfmlalq_laneq_low_f16(float32x4_t __r, float16x8_t __a, + float16x8_t __b, const int __lane) { return __builtin_aarch64_fmlalq_laneq_lowv4sf(__r, __a, __b, __lane); } -FUNK float32x4_t vfmlslq_laneq_low_f16(float32x4_t __r, float16x8_t __a, - float16x8_t __b, const int __lane) { +__funline float32x4_t vfmlslq_laneq_low_f16(float32x4_t __r, float16x8_t __a, + float16x8_t __b, const int __lane) { return __builtin_aarch64_fmlslq_laneq_lowv4sf(__r, __a, __b, __lane); } -FUNK float32x2_t vfmlal_lane_high_f16(float32x2_t __r, float16x4_t __a, - float16x4_t __b, const int __lane) { +__funline float32x2_t vfmlal_lane_high_f16(float32x2_t __r, float16x4_t __a, + float16x4_t __b, const int __lane) { 
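/* The vfmlal/vfmlsl intrinsics widen float16 elements to float32 and
   fuse a multiply-accumulate (FMLAL) or multiply-subtract (FMLSL) into
   the float32 accumulator; the _low forms consume the lower half of
   the float16 inputs and the _high forms the upper half. */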
return __builtin_aarch64_fmlal_lane_highv2sf(__r, __a, __b, __lane); } -FUNK float32x2_t vfmlsl_lane_high_f16(float32x2_t __r, float16x4_t __a, - float16x4_t __b, const int __lane) { +__funline float32x2_t vfmlsl_lane_high_f16(float32x2_t __r, float16x4_t __a, + float16x4_t __b, const int __lane) { return __builtin_aarch64_fmlsl_lane_highv2sf(__r, __a, __b, __lane); } -FUNK float32x2_t vfmlal_laneq_high_f16(float32x2_t __r, float16x4_t __a, - float16x8_t __b, const int __lane) { +__funline float32x2_t vfmlal_laneq_high_f16(float32x2_t __r, float16x4_t __a, + float16x8_t __b, const int __lane) { return __builtin_aarch64_fmlal_laneq_highv2sf(__r, __a, __b, __lane); } -FUNK float32x2_t vfmlsl_laneq_high_f16(float32x2_t __r, float16x4_t __a, - float16x8_t __b, const int __lane) { +__funline float32x2_t vfmlsl_laneq_high_f16(float32x2_t __r, float16x4_t __a, + float16x8_t __b, const int __lane) { return __builtin_aarch64_fmlsl_laneq_highv2sf(__r, __a, __b, __lane); } -FUNK float32x4_t vfmlalq_lane_high_f16(float32x4_t __r, float16x8_t __a, - float16x4_t __b, const int __lane) { +__funline float32x4_t vfmlalq_lane_high_f16(float32x4_t __r, float16x8_t __a, + float16x4_t __b, const int __lane) { return __builtin_aarch64_fmlalq_lane_highv4sf(__r, __a, __b, __lane); } -FUNK float32x4_t vfmlslq_lane_high_f16(float32x4_t __r, float16x8_t __a, - float16x4_t __b, const int __lane) { +__funline float32x4_t vfmlslq_lane_high_f16(float32x4_t __r, float16x8_t __a, + float16x4_t __b, const int __lane) { return __builtin_aarch64_fmlslq_lane_highv4sf(__r, __a, __b, __lane); } -FUNK float32x4_t vfmlalq_laneq_high_f16(float32x4_t __r, float16x8_t __a, - float16x8_t __b, const int __lane) { +__funline float32x4_t vfmlalq_laneq_high_f16(float32x4_t __r, float16x8_t __a, + float16x8_t __b, + const int __lane) { return __builtin_aarch64_fmlalq_laneq_highv4sf(__r, __a, __b, __lane); } -FUNK float32x4_t vfmlslq_laneq_high_f16(float32x4_t __r, float16x8_t __a, - float16x8_t __b, const int __lane) { +__funline float32x4_t vfmlslq_laneq_high_f16(float32x4_t __r, float16x8_t __a, + float16x8_t __b, + const int __lane) { return __builtin_aarch64_fmlslq_laneq_highv4sf(__r, __a, __b, __lane); } @@ -22123,6 +22290,5 @@ FUNK float32x4_t vfmlslq_laneq_high_f16(float32x4_t __r, float16x8_t __a, #undef __aarch64_vdupq_laneq_u64 #pragma GCC push_options /* -Wno-missing-braces */ -#undef FUNK -#endif /* __aarch64__ */ -#endif /* _AARCH64_NEON_H_ */ +#endif /* __aarch64__ */ +#endif /* _AARCH64_NEON_H_ */ diff --git a/third_party/ggml/ggml.c b/third_party/ggml/ggml.c index aefd4f9ce..6a3f07401 100644 --- a/third_party/ggml/ggml.c +++ b/third_party/ggml/ggml.c @@ -1784,24 +1784,40 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); + // // Main loop - for (int i = 0; i < nb; ++i) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
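/* The rewrite below keeps the same per-block math but factors it into
   a WORK() macro so the main loop can be unrolled twelve blocks at a
   time, issuing _mm_prefetch hints (locality hint 3, i.e. _MM_HINT_T0)
   roughly twelve blocks ahead of the current position; a scalar tail
   loop then handles the remaining nb % 12 blocks. */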
- const __m256i off = _mm256_set1_epi8( 8 ); - bx = _mm256_sub_epi8( bx, off ); - - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps( d, q, acc ); + // +#define WORK(I) \ + /* Compute combined scale for the block */ \ + const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[I].d ), _mm256_broadcast_ss( &y[I].d ) ); \ + __m256i bx = bytes_from_nibbles_32(x[I].qs); \ + /* Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */ \ + const __m256i off = _mm256_set1_epi8( 8 ); \ + bx = _mm256_sub_epi8( bx, off ); \ + __m256i by = _mm256_loadu_si256((const __m256i *)y[I].qs); \ + const __m256 q = mul_sum_i8_pairs_float(bx, by); \ + /* Multiply q with scale and accumulate */ \ + acc = _mm256_fmadd_ps( d, q, acc ) + int i = 0; + for (; i + 12 < nb; i += 12) { + _mm_prefetch(x+i+12, 3); + _mm_prefetch(x+i+15, 3); + _mm_prefetch(x+i+18, 3); + _mm_prefetch(x+i+21, 3); + _mm_prefetch(y+i+12, 3); + _mm_prefetch(y+i+14, 3); + _mm_prefetch(y+i+16, 3); + _mm_prefetch(y+i+18, 3); + _mm_prefetch(y+i+20, 3); + _mm_prefetch(y+i+22, 3); + for (int j = 0; j < 12; ++j) { + WORK(i+j); + } } + for (; i < nb; ++i) { + WORK(i); + } +#undef WORK *s = hsum_float_8(acc); #elif defined(__AVX__) diff --git a/third_party/intel/adxintrin.internal.h b/third_party/intel/adxintrin.internal.h index fbfbbbc89..b0f4e9b02 100644 --- a/third_party/intel/adxintrin.internal.h +++ b/third_party/intel/adxintrin.internal.h @@ -5,46 +5,37 @@ #ifndef _ADXINTRIN_H_INCLUDED #define _ADXINTRIN_H_INCLUDED -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _subborrow_u32(unsigned char __CF, unsigned int __X, unsigned int __Y, - unsigned int *__P) { +__funline unsigned char _subborrow_u32(unsigned char __CF, unsigned int __X, + unsigned int __Y, unsigned int *__P) { return __builtin_ia32_sbb_u32(__CF, __X, __Y, __P); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _addcarry_u32(unsigned char __CF, unsigned int __X, unsigned int __Y, - unsigned int *__P) { +__funline unsigned char _addcarry_u32(unsigned char __CF, unsigned int __X, + unsigned int __Y, unsigned int *__P) { return __builtin_ia32_addcarryx_u32(__CF, __X, __Y, __P); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _addcarryx_u32(unsigned char __CF, unsigned int __X, unsigned int __Y, - unsigned int *__P) { +__funline unsigned char _addcarryx_u32(unsigned char __CF, unsigned int __X, + unsigned int __Y, unsigned int *__P) { return __builtin_ia32_addcarryx_u32(__CF, __X, __Y, __P); } #ifdef __x86_64__ -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _subborrow_u64(unsigned char __CF, unsigned long long __X, - unsigned long long __Y, unsigned long long *__P) { +__funline unsigned char _subborrow_u64(unsigned char __CF, unsigned long long __X, + unsigned long long __Y, + unsigned long long *__P) { return __builtin_ia32_sbb_u64(__CF, __X, __Y, __P); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _addcarry_u64(unsigned char __CF, unsigned long long __X, - unsigned long long __Y, unsigned long long *__P) { +__funline unsigned char _addcarry_u64(unsigned char __CF, unsigned long long __X, + unsigned long long __Y, + unsigned long long *__P) { 
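/* These carry intrinsics are designed to chain: the carry-out of one
   limb feeds the carry-in of the next, so a multi-word add needs no
   explicit flag juggling. Hypothetical sketch (not from this patch),
   adding two 256-bit numbers held as four 64-bit limbs a[] and b[]:

     unsigned char c = 0;
     c = _addcarry_u64(c, a[0], b[0], &r[0]);
     c = _addcarry_u64(c, a[1], b[1], &r[1]);
     c = _addcarry_u64(c, a[2], b[2], &r[2]);
     c = _addcarry_u64(c, a[3], b[3], &r[3]);  // final c is the overflow
*/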
return __builtin_ia32_addcarryx_u64(__CF, __X, __Y, __P); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _addcarryx_u64(unsigned char __CF, unsigned long long __X, - unsigned long long __Y, unsigned long long *__P) { +__funline unsigned char _addcarryx_u64(unsigned char __CF, unsigned long long __X, + unsigned long long __Y, + unsigned long long *__P) { return __builtin_ia32_addcarryx_u64(__CF, __X, __Y, __P); } #endif diff --git a/third_party/intel/ammintrin.internal.h b/third_party/intel/ammintrin.internal.h index 9e899067d..66045f832 100644 --- a/third_party/intel/ammintrin.internal.h +++ b/third_party/intel/ammintrin.internal.h @@ -9,28 +9,21 @@ #define __DISABLE_SSE4A__ #endif /* __SSE4A__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_stream_sd(double* __P, __m128d __Y) { +__funline void _mm_stream_sd(double* __P, __m128d __Y) { __builtin_ia32_movntsd(__P, (__v2df)__Y); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_stream_ss(float* __P, __m128 __Y) { +__funline void _mm_stream_ss(float* __P, __m128 __Y) { __builtin_ia32_movntss(__P, (__v4sf)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_si64(__m128i __X, __m128i __Y) { +__funline __m128i _mm_extract_si64(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_extrq((__v2di)__X, (__v16qi)__Y); } #ifdef __OPTIMIZE__ -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_extracti_si64(__m128i __X, unsigned const int __I, unsigned const int __L) { +__funline __m128i _mm_extracti_si64(__m128i __X, unsigned const int __I, + unsigned const int __L) { return (__m128i)__builtin_ia32_extrqi((__v2di)__X, __I, __L); } #else @@ -39,17 +32,14 @@ _mm_extracti_si64(__m128i __X, unsigned const int __I, unsigned const int __L) { (unsigned int)(L))) #endif -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_si64(__m128i __X, __m128i __Y) { +__funline __m128i _mm_insert_si64(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_insertq((__v2di)__X, (__v2di)__Y); } #ifdef __OPTIMIZE__ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_inserti_si64(__m128i __X, __m128i __Y, unsigned const int __I, - unsigned const int __L) { +__funline __m128i _mm_inserti_si64(__m128i __X, __m128i __Y, + unsigned const int __I, + unsigned const int __L) { return (__m128i)__builtin_ia32_insertqi((__v2di)__X, (__v2di)__Y, __I, __L); } #else diff --git a/third_party/intel/avx2intrin.internal.h b/third_party/intel/avx2intrin.internal.h index 524eebd02..6806f5ef6 100644 --- a/third_party/intel/avx2intrin.internal.h +++ b/third_party/intel/avx2intrin.internal.h @@ -12,9 +12,7 @@ #endif /* __AVX2__ */ #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mpsadbw_epu8(__m256i __X, __m256i __Y, const int __M) { +__funline __m256i _mm256_mpsadbw_epu8(__m256i __X, __m256i __Y, const int __M) { return (__m256i)__builtin_ia32_mpsadbw256((__v32qi)__X, (__v32qi)__Y, __M); } #else @@ -23,100 +21,68 @@ extern __inline __m256i (__v32qi)(__m256i)(Y), (int)(M))) #endif -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_abs_epi8(__m256i __A) { +__funline __m256i _mm256_abs_epi8(__m256i __A) { 
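/* PABSB computes a lanewise absolute value; note that -128 has no
   positive int8 counterpart, so it comes back unchanged (0x80). */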
return (__m256i)__builtin_ia32_pabsb256((__v32qi)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_abs_epi16(__m256i __A) { +__funline __m256i _mm256_abs_epi16(__m256i __A) { return (__m256i)__builtin_ia32_pabsw256((__v16hi)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_abs_epi32(__m256i __A) { +__funline __m256i _mm256_abs_epi32(__m256i __A) { return (__m256i)__builtin_ia32_pabsd256((__v8si)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_packs_epi32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_packs_epi32(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_packssdw256((__v8si)__A, (__v8si)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_packs_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_packs_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_packsswb256((__v16hi)__A, (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_packus_epi32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_packus_epi32(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_packusdw256((__v8si)__A, (__v8si)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_packus_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_packus_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_packuswb256((__v16hi)__A, (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_add_epi8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_add_epi8(__m256i __A, __m256i __B) { return (__m256i)((__v32qu)__A + (__v32qu)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_add_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_add_epi16(__m256i __A, __m256i __B) { return (__m256i)((__v16hu)__A + (__v16hu)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_add_epi32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_add_epi32(__m256i __A, __m256i __B) { return (__m256i)((__v8su)__A + (__v8su)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_add_epi64(__m256i __A, __m256i __B) { +__funline __m256i _mm256_add_epi64(__m256i __A, __m256i __B) { return (__m256i)((__v4du)__A + (__v4du)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_adds_epi8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_adds_epi8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_paddsb256((__v32qi)__A, (__v32qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_adds_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_adds_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_paddsw256((__v16hi)__A, (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_adds_epu8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_adds_epu8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_paddusb256((__v32qi)__A, (__v32qi)__B); } -extern __inline __m256i - 
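/* The adds_* forms saturate rather than wrap: signed results clamp to
   [-128, 127] (or [-32768, 32767] for the 16-bit variants), unsigned
   results to [0, 255] (or [0, 65535]). */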
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_adds_epu16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_adds_epu16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_paddusw256((__v16hi)__A, (__v16hi)__B); } #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_alignr_epi8(__m256i __A, __m256i __B, const int __N) { +__funline __m256i _mm256_alignr_epi8(__m256i __A, __m256i __B, const int __N) { return (__m256i)__builtin_ia32_palignr256((__v4di)__A, (__v4di)__B, __N * 8); } #else @@ -125,41 +91,29 @@ extern __inline __m256i (__v4di)(__m256i)(B), (int)(N)*8)) #endif -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_and_si256(__m256i __A, __m256i __B) { +__funline __m256i _mm256_and_si256(__m256i __A, __m256i __B) { return (__m256i)((__v4du)__A & (__v4du)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_andnot_si256(__m256i __A, __m256i __B) { +__funline __m256i _mm256_andnot_si256(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_andnotsi256((__v4di)__A, (__v4di)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_avg_epu8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_avg_epu8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pavgb256((__v32qi)__A, (__v32qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_avg_epu16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_avg_epu16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pavgw256((__v16hi)__A, (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_blendv_epi8(__m256i __X, __m256i __Y, __m256i __M) { +__funline __m256i _mm256_blendv_epi8(__m256i __X, __m256i __Y, __m256i __M) { return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__X, (__v32qi)__Y, (__v32qi)__M); } #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_blend_epi16(__m256i __X, __m256i __Y, const int __M) { +__funline __m256i _mm256_blend_epi16(__m256i __X, __m256i __Y, const int __M) { return (__m256i)__builtin_ia32_pblendw256((__v16hi)__X, (__v16hi)__Y, __M); } #else @@ -168,328 +122,220 @@ extern __inline __m256i (__v16hi)(__m256i)(Y), (int)(M))) #endif -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpeq_epi8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_cmpeq_epi8(__m256i __A, __m256i __B) { return (__m256i)((__v32qi)__A == (__v32qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpeq_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_cmpeq_epi16(__m256i __A, __m256i __B) { return (__m256i)((__v16hi)__A == (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpeq_epi32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_cmpeq_epi32(__m256i __A, __m256i __B) { return (__m256i)((__v8si)__A == (__v8si)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpeq_epi64(__m256i __A, __m256i __B) { +__funline __m256i _mm256_cmpeq_epi64(__m256i __A, __m256i __B) { return (__m256i)((__v4di)__A == (__v4di)__B); } 
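/* A minimal usage sketch (not part of this header): the all-ones lanes
   produced by the AVX2 compare intrinsics pair naturally with
   _mm256_movemask_epi8 for byte scanning. The helper name find_byte32
   is hypothetical. */
#include <immintrin.h>

/* Returns the index of the first occurrence of `c` in the 32-byte
   block at `p`, or -1 if `c` is absent. */
static int find_byte32(const unsigned char *p, unsigned char c) {
  __m256i hay = _mm256_loadu_si256((const __m256i *)p);
  __m256i eq = _mm256_cmpeq_epi8(hay, _mm256_set1_epi8((char)c));
  unsigned mask = (unsigned)_mm256_movemask_epi8(eq);
  return mask ? (int)__builtin_ctz(mask) : -1;
}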
-extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpgt_epi8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_cmpgt_epi8(__m256i __A, __m256i __B) { return (__m256i)((__v32qi)__A > (__v32qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpgt_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_cmpgt_epi16(__m256i __A, __m256i __B) { return (__m256i)((__v16hi)__A > (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpgt_epi32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_cmpgt_epi32(__m256i __A, __m256i __B) { return (__m256i)((__v8si)__A > (__v8si)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpgt_epi64(__m256i __A, __m256i __B) { +__funline __m256i _mm256_cmpgt_epi64(__m256i __A, __m256i __B) { return (__m256i)((__v4di)__A > (__v4di)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_hadd_epi16(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_hadd_epi16(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_phaddw256((__v16hi)__X, (__v16hi)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_hadd_epi32(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_hadd_epi32(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_phaddd256((__v8si)__X, (__v8si)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_hadds_epi16(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_hadds_epi16(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__X, (__v16hi)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_hsub_epi16(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_hsub_epi16(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_phsubw256((__v16hi)__X, (__v16hi)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_hsub_epi32(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_hsub_epi32(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_phsubd256((__v8si)__X, (__v8si)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_hsubs_epi16(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_hsubs_epi16(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__X, (__v16hi)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maddubs_epi16(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_maddubs_epi16(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__X, (__v32qi)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_madd_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_madd_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__A, (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_max_epi8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_max_epi8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__A, (__v32qi)__B); } -extern 
__inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_max_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_max_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__A, (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_max_epi32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_max_epi32(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__A, (__v8si)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_max_epu8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_max_epu8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__A, (__v32qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_max_epu16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_max_epu16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__A, (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_max_epu32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_max_epu32(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pmaxud256((__v8si)__A, (__v8si)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_min_epi8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_min_epi8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pminsb256((__v32qi)__A, (__v32qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_min_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_min_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pminsw256((__v16hi)__A, (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_min_epi32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_min_epi32(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pminsd256((__v8si)__A, (__v8si)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_min_epu8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_min_epu8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pminub256((__v32qi)__A, (__v32qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_min_epu16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_min_epu16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pminuw256((__v16hi)__A, (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_min_epu32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_min_epu32(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pminud256((__v8si)__A, (__v8si)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_movemask_epi8(__m256i __A) { +__funline int _mm256_movemask_epi8(__m256i __A) { return __builtin_ia32_pmovmskb256((__v32qi)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi8_epi16(__m128i __X) { +__funline __m256i _mm256_cvtepi8_epi16(__m128i __X) { return (__m256i)__builtin_ia32_pmovsxbw256((__v16qi)__X); } -extern __inline __m256i - 
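/* The cvtepi8/cvtepi16/cvtepi32 widenings sign-extend each source lane
   (PMOVSX*), while the cvtepu* family below zero-extends (PMOVZX*). */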
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi8_epi32(__m128i __X) { +__funline __m256i _mm256_cvtepi8_epi32(__m128i __X) { return (__m256i)__builtin_ia32_pmovsxbd256((__v16qi)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi8_epi64(__m128i __X) { +__funline __m256i _mm256_cvtepi8_epi64(__m128i __X) { return (__m256i)__builtin_ia32_pmovsxbq256((__v16qi)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi16_epi32(__m128i __X) { +__funline __m256i _mm256_cvtepi16_epi32(__m128i __X) { return (__m256i)__builtin_ia32_pmovsxwd256((__v8hi)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi16_epi64(__m128i __X) { +__funline __m256i _mm256_cvtepi16_epi64(__m128i __X) { return (__m256i)__builtin_ia32_pmovsxwq256((__v8hi)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi32_epi64(__m128i __X) { +__funline __m256i _mm256_cvtepi32_epi64(__m128i __X) { return (__m256i)__builtin_ia32_pmovsxdq256((__v4si)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepu8_epi16(__m128i __X) { +__funline __m256i _mm256_cvtepu8_epi16(__m128i __X) { return (__m256i)__builtin_ia32_pmovzxbw256((__v16qi)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepu8_epi32(__m128i __X) { +__funline __m256i _mm256_cvtepu8_epi32(__m128i __X) { return (__m256i)__builtin_ia32_pmovzxbd256((__v16qi)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepu8_epi64(__m128i __X) { +__funline __m256i _mm256_cvtepu8_epi64(__m128i __X) { return (__m256i)__builtin_ia32_pmovzxbq256((__v16qi)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepu16_epi32(__m128i __X) { +__funline __m256i _mm256_cvtepu16_epi32(__m128i __X) { return (__m256i)__builtin_ia32_pmovzxwd256((__v8hi)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepu16_epi64(__m128i __X) { +__funline __m256i _mm256_cvtepu16_epi64(__m128i __X) { return (__m256i)__builtin_ia32_pmovzxwq256((__v8hi)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepu32_epi64(__m128i __X) { +__funline __m256i _mm256_cvtepu32_epi64(__m128i __X) { return (__m256i)__builtin_ia32_pmovzxdq256((__v4si)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mul_epi32(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_mul_epi32(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_pmuldq256((__v8si)__X, (__v8si)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mulhrs_epi16(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_mulhrs_epi16(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__X, (__v16hi)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mulhi_epu16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_mulhi_epu16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__A, (__v16hi)__B); } -extern __inline 
__m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mulhi_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_mulhi_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__A, (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mullo_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_mullo_epi16(__m256i __A, __m256i __B) { return (__m256i)((__v16hu)__A * (__v16hu)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mullo_epi32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_mullo_epi32(__m256i __A, __m256i __B) { return (__m256i)((__v8su)__A * (__v8su)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mul_epu32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_mul_epu32(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pmuludq256((__v8si)__A, (__v8si)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_or_si256(__m256i __A, __m256i __B) { +__funline __m256i _mm256_or_si256(__m256i __A, __m256i __B) { return (__m256i)((__v4du)__A | (__v4du)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sad_epu8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_sad_epu8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_psadbw256((__v32qi)__A, (__v32qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shuffle_epi8(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_shuffle_epi8(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_pshufb256((__v32qi)__X, (__v32qi)__Y); } #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shuffle_epi32(__m256i __A, const int __mask) { +__funline __m256i _mm256_shuffle_epi32(__m256i __A, const int __mask) { return (__m256i)__builtin_ia32_pshufd256((__v8si)__A, __mask); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shufflehi_epi16(__m256i __A, const int __mask) { +__funline __m256i _mm256_shufflehi_epi16(__m256i __A, const int __mask) { return (__m256i)__builtin_ia32_pshufhw256((__v16hi)__A, __mask); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shufflelo_epi16(__m256i __A, const int __mask) { +__funline __m256i _mm256_shufflelo_epi16(__m256i __A, const int __mask) { return (__m256i)__builtin_ia32_pshuflw256((__v16hi)__A, __mask); } #else @@ -501,34 +347,24 @@ extern __inline __m256i ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(A), (int)(N))) #endif -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sign_epi8(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_sign_epi8(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psignb256((__v32qi)__X, (__v32qi)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sign_epi16(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_sign_epi16(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psignw256((__v16hi)__X, (__v16hi)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
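/* PSIGN semantics: each result lane is __X when the corresponding __Y
   lane is positive, -__X when it is negative, and 0 when it is zero. */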
_mm256_sign_epi32(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_sign_epi32(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psignd256((__v8si)__X, (__v8si)__Y); } #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_bslli_epi128(__m256i __A, const int __N) { +__funline __m256i _mm256_bslli_epi128(__m256i __A, const int __N) { return (__m256i)__builtin_ia32_pslldqi256(__A, __N * 8); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_slli_si256(__m256i __A, const int __N) { +__funline __m256i _mm256_slli_si256(__m256i __A, const int __N) { return (__m256i)__builtin_ia32_pslldqi256(__A, __N * 8); } #else @@ -538,76 +374,52 @@ extern __inline __m256i ((__m256i)__builtin_ia32_pslldqi256((__m256i)(A), (int)(N)*8)) #endif -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_slli_epi16(__m256i __A, int __B) { +__funline __m256i _mm256_slli_epi16(__m256i __A, int __B) { return (__m256i)__builtin_ia32_psllwi256((__v16hi)__A, __B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sll_epi16(__m256i __A, __m128i __B) { +__funline __m256i _mm256_sll_epi16(__m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_slli_epi32(__m256i __A, int __B) { +__funline __m256i _mm256_slli_epi32(__m256i __A, int __B) { return (__m256i)__builtin_ia32_pslldi256((__v8si)__A, __B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sll_epi32(__m256i __A, __m128i __B) { +__funline __m256i _mm256_sll_epi32(__m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_slli_epi64(__m256i __A, int __B) { +__funline __m256i _mm256_slli_epi64(__m256i __A, int __B) { return (__m256i)__builtin_ia32_psllqi256((__v4di)__A, __B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sll_epi64(__m256i __A, __m128i __B) { +__funline __m256i _mm256_sll_epi64(__m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_srai_epi16(__m256i __A, int __B) { +__funline __m256i _mm256_srai_epi16(__m256i __A, int __B) { return (__m256i)__builtin_ia32_psrawi256((__v16hi)__A, __B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sra_epi16(__m256i __A, __m128i __B) { +__funline __m256i _mm256_sra_epi16(__m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psraw256((__v16hi)__A, (__v8hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_srai_epi32(__m256i __A, int __B) { +__funline __m256i _mm256_srai_epi32(__m256i __A, int __B) { return (__m256i)__builtin_ia32_psradi256((__v8si)__A, __B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sra_epi32(__m256i __A, __m128i __B) { +__funline __m256i _mm256_sra_epi32(__m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psrad256((__v8si)__A, (__v4si)__B); } #ifdef 
__OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_bsrli_epi128(__m256i __A, const int __N) { +__funline __m256i _mm256_bsrli_epi128(__m256i __A, const int __N) { return (__m256i)__builtin_ia32_psrldqi256(__A, __N * 8); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_srli_si256(__m256i __A, const int __N) { +__funline __m256i _mm256_srli_si256(__m256i __A, const int __N) { return (__m256i)__builtin_ia32_psrldqi256(__A, __N * 8); } #else @@ -617,178 +429,120 @@ extern __inline __m256i ((__m256i)__builtin_ia32_psrldqi256((__m256i)(A), (int)(N)*8)) #endif -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_srli_epi16(__m256i __A, int __B) { +__funline __m256i _mm256_srli_epi16(__m256i __A, int __B) { return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__A, __B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_srl_epi16(__m256i __A, __m128i __B) { +__funline __m256i _mm256_srl_epi16(__m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_srli_epi32(__m256i __A, int __B) { +__funline __m256i _mm256_srli_epi32(__m256i __A, int __B) { return (__m256i)__builtin_ia32_psrldi256((__v8si)__A, __B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_srl_epi32(__m256i __A, __m128i __B) { +__funline __m256i _mm256_srl_epi32(__m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_srli_epi64(__m256i __A, int __B) { +__funline __m256i _mm256_srli_epi64(__m256i __A, int __B) { return (__m256i)__builtin_ia32_psrlqi256((__v4di)__A, __B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_srl_epi64(__m256i __A, __m128i __B) { +__funline __m256i _mm256_srl_epi64(__m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sub_epi8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_sub_epi8(__m256i __A, __m256i __B) { return (__m256i)((__v32qu)__A - (__v32qu)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sub_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_sub_epi16(__m256i __A, __m256i __B) { return (__m256i)((__v16hu)__A - (__v16hu)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sub_epi32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_sub_epi32(__m256i __A, __m256i __B) { return (__m256i)((__v8su)__A - (__v8su)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sub_epi64(__m256i __A, __m256i __B) { +__funline __m256i _mm256_sub_epi64(__m256i __A, __m256i __B) { return (__m256i)((__v4du)__A - (__v4du)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_subs_epi8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_subs_epi8(__m256i __A, __m256i __B) { return 
(__m256i)__builtin_ia32_psubsb256((__v32qi)__A, (__v32qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_subs_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_subs_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_psubsw256((__v16hi)__A, (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_subs_epu8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_subs_epu8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_psubusb256((__v32qi)__A, (__v32qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_subs_epu16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_subs_epu16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_psubusw256((__v16hi)__A, (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_unpackhi_epi8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_unpackhi_epi8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpckhbw256((__v32qi)__A, (__v32qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_unpackhi_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_unpackhi_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpckhwd256((__v16hi)__A, (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_unpackhi_epi32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_unpackhi_epi32(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpckhdq256((__v8si)__A, (__v8si)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_unpackhi_epi64(__m256i __A, __m256i __B) { +__funline __m256i _mm256_unpackhi_epi64(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpckhqdq256((__v4di)__A, (__v4di)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_unpacklo_epi8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_unpacklo_epi8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpcklbw256((__v32qi)__A, (__v32qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_unpacklo_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_unpacklo_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpcklwd256((__v16hi)__A, (__v16hi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_unpacklo_epi32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_unpacklo_epi32(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpckldq256((__v8si)__A, (__v8si)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_unpacklo_epi64(__m256i __A, __m256i __B) { +__funline __m256i _mm256_unpacklo_epi64(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpcklqdq256((__v4di)__A, (__v4di)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_xor_si256(__m256i __A, __m256i __B) { +__funline __m256i _mm256_xor_si256(__m256i __A, __m256i __B) { return (__m256i)((__v4du)__A ^ (__v4du)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm256_stream_load_si256(__m256i const *__X) { +__funline __m256i _mm256_stream_load_si256(__m256i const *__X) { return (__m256i)__builtin_ia32_movntdqa256((__v4di *)__X); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_broadcastss_ps(__m128 __X) { +__funline __m128 _mm_broadcastss_ps(__m128 __X) { return (__m128)__builtin_ia32_vbroadcastss_ps((__v4sf)__X); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcastss_ps(__m128 __X) { +__funline __m256 _mm256_broadcastss_ps(__m128 __X) { return (__m256)__builtin_ia32_vbroadcastss_ps256((__v4sf)__X); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcastsd_pd(__m128d __X) { +__funline __m256d _mm256_broadcastsd_pd(__m128d __X) { return (__m256d)__builtin_ia32_vbroadcastsd_pd256((__v2df)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcastsi128_si256(__m128i __X) { +__funline __m256i _mm256_broadcastsi128_si256(__m128i __X) { return (__m256i)__builtin_ia32_vbroadcastsi256((__v2di)__X); } #ifdef __OPTIMIZE__ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_blend_epi32(__m128i __X, __m128i __Y, const int __M) { +__funline __m128i _mm_blend_epi32(__m128i __X, __m128i __Y, const int __M) { return (__m128i)__builtin_ia32_pblendd128((__v4si)__X, (__v4si)__Y, __M); } #else @@ -798,9 +552,7 @@ extern __inline __m128i #endif #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_blend_epi32(__m256i __X, __m256i __Y, const int __M) { +__funline __m256i _mm256_blend_epi32(__m256i __X, __m256i __Y, const int __M) { return (__m256i)__builtin_ia32_pblendd256((__v8si)__X, (__v8si)__Y, __M); } #else @@ -809,64 +561,44 @@ extern __inline __m256i (__v8si)(__m256i)(Y), (int)(M))) #endif -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcastb_epi8(__m128i __X) { +__funline __m256i _mm256_broadcastb_epi8(__m128i __X) { return (__m256i)__builtin_ia32_pbroadcastb256((__v16qi)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcastw_epi16(__m128i __X) { +__funline __m256i _mm256_broadcastw_epi16(__m128i __X) { return (__m256i)__builtin_ia32_pbroadcastw256((__v8hi)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcastd_epi32(__m128i __X) { +__funline __m256i _mm256_broadcastd_epi32(__m128i __X) { return (__m256i)__builtin_ia32_pbroadcastd256((__v4si)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcastq_epi64(__m128i __X) { +__funline __m256i _mm256_broadcastq_epi64(__m128i __X) { return (__m256i)__builtin_ia32_pbroadcastq256((__v2di)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_broadcastb_epi8(__m128i __X) { +__funline __m128i _mm_broadcastb_epi8(__m128i __X) { return (__m128i)__builtin_ia32_pbroadcastb128((__v16qi)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_broadcastw_epi16(__m128i __X) { +__funline __m128i _mm_broadcastw_epi16(__m128i __X) { return (__m128i)__builtin_ia32_pbroadcastw128((__v8hi)__X); } -extern __inline 
__m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_broadcastd_epi32(__m128i __X) { +__funline __m128i _mm_broadcastd_epi32(__m128i __X) { return (__m128i)__builtin_ia32_pbroadcastd128((__v4si)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_broadcastq_epi64(__m128i __X) { +__funline __m128i _mm_broadcastq_epi64(__m128i __X) { return (__m128i)__builtin_ia32_pbroadcastq128((__v2di)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutevar8x32_epi32(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_permutevar8x32_epi32(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_permvarsi256((__v8si)__X, (__v8si)__Y); } #ifdef __OPTIMIZE__ -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permute4x64_pd(__m256d __X, const int __M) { +__funline __m256d _mm256_permute4x64_pd(__m256d __X, const int __M) { return (__m256d)__builtin_ia32_permdf256((__v4df)__X, __M); } #else @@ -874,16 +606,12 @@ extern __inline __m256d ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(X), (int)(M))) #endif -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutevar8x32_ps(__m256 __X, __m256i __Y) { +__funline __m256 _mm256_permutevar8x32_ps(__m256 __X, __m256i __Y) { return (__m256)__builtin_ia32_permvarsf256((__v8sf)__X, (__v8si)__Y); } #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permute4x64_epi64(__m256i __X, const int __M) { +__funline __m256i _mm256_permute4x64_epi64(__m256i __X, const int __M) { return (__m256i)__builtin_ia32_permdi256((__v4di)__X, __M); } #else @@ -892,9 +620,8 @@ extern __inline __m256i #endif #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permute2x128_si256(__m256i __X, __m256i __Y, const int __M) { +__funline __m256i _mm256_permute2x128_si256(__m256i __X, __m256i __Y, + const int __M) { return (__m256i)__builtin_ia32_permti256((__v4di)__X, (__v4di)__Y, __M); } #else @@ -904,9 +631,7 @@ extern __inline __m256i #endif #ifdef __OPTIMIZE__ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_extracti128_si256(__m256i __X, const int __M) { +__funline __m128i _mm256_extracti128_si256(__m256i __X, const int __M) { return (__m128i)__builtin_ia32_extract128i256((__v4di)__X, __M); } #else @@ -915,9 +640,8 @@ extern __inline __m128i #endif #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_inserti128_si256(__m256i __X, __m128i __Y, const int __M) { +__funline __m256i _mm256_inserti128_si256(__m256i __X, __m128i __Y, + const int __M) { return (__m256i)__builtin_ia32_insert128i256((__v4di)__X, (__v2di)__Y, __M); } #else @@ -926,118 +650,81 @@ extern __inline __m256i (__v2di)(__m128i)(Y), (int)(M))) #endif -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskload_epi32(int const *__X, __m256i __M) { +__funline __m256i _mm256_maskload_epi32(int const *__X, __m256i __M) { return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskload_epi64(long long const *__X, __m256i __M) { +__funline 
__m256i _mm256_maskload_epi64(long long const *__X, __m256i __M) { return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskload_epi32(int const *__X, __m128i __M) { +__funline __m128i _mm_maskload_epi32(int const *__X, __m128i __M) { return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskload_epi64(long long const *__X, __m128i __M) { +__funline __m128i _mm_maskload_epi64(long long const *__X, __m128i __M) { return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) { +__funline void _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) { __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) { +__funline void _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) { __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y) { +__funline void _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y) { __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) { +__funline void _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) { __builtin_ia32_maskstoreq((__v2di *)__X, (__v2di)__M, (__v2di)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sllv_epi32(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_sllv_epi32(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sllv_epi32(__m128i __X, __m128i __Y) { +__funline __m128i _mm_sllv_epi32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sllv_epi64(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_sllv_epi64(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sllv_epi64(__m128i __X, __m128i __Y) { +__funline __m128i _mm_sllv_epi64(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_srav_epi32(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_srav_epi32(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_srav_epi32(__m128i __X, __m128i __Y) { +__funline __m128i _mm_srav_epi32(__m128i __X, __m128i __Y) { return 
(__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_srlv_epi32(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_srlv_epi32(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_srlv_epi32(__m128i __X, __m128i __Y) { +__funline __m128i _mm_srlv_epi32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_srlv_epi64(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_srlv_epi64(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_srlv_epi64(__m128i __X, __m128i __Y) { +__funline __m128i _mm_srlv_epi64(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y); } #ifdef __OPTIMIZE__ -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i32gather_pd(double const *__base, __m128i __index, const int __scale) { +__funline __m128d _mm_i32gather_pd(double const *__base, __m128i __index, + const int __scale) { __v2df __zero = _mm_setzero_pd(); __v2df __mask = _mm_cmpeq_pd(__zero, __zero); @@ -1045,277 +732,238 @@ extern __inline __m128d (__v4si)__index, __mask, __scale); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_i32gather_pd(__m128d __src, double const *__base, __m128i __index, - __m128d __mask, const int __scale) { +__funline __m128d _mm_mask_i32gather_pd(__m128d __src, double const *__base, + __m128i __index, __m128d __mask, + const int __scale) { return (__m128d)__builtin_ia32_gathersiv2df( (__v2df)__src, __base, (__v4si)__index, (__v2df)__mask, __scale); } -extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_i32gather_pd(double const *__base, __m128i __index, const int __scale) { +__funline __m256d _mm256_i32gather_pd(double const *__base, __m128i __index, + const int __scale) { __v4df __zero = _mm256_setzero_pd(); __v4df __mask = _mm256_cmp_pd(__zero, __zero, _CMP_EQ_OQ); return (__m256d)__builtin_ia32_gathersiv4df(_mm256_undefined_pd(), __base, (__v4si)__index, __mask, __scale); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i32gather_pd(__m256d __src, double const *__base, - __m128i __index, __m256d __mask, - const int __scale) { +__funline __m256d _mm256_mask_i32gather_pd(__m256d __src, double const *__base, + __m128i __index, __m256d __mask, + const int __scale) { return (__m256d)__builtin_ia32_gathersiv4df( (__v4df)__src, __base, (__v4si)__index, (__v4df)__mask, __scale); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i64gather_pd(double const *__base, __m128i __index, const int __scale) { +__funline __m128d _mm_i64gather_pd(double const *__base, __m128i __index, + const int __scale) { __v2df __src = _mm_setzero_pd(); __v2df __mask = _mm_cmpeq_pd(__src, __src); return (__m128d)__builtin_ia32_gatherdiv2df(__src, __base, (__v2di)__index, __mask, __scale); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm_mask_i64gather_pd(__m128d __src, double const *__base, __m128i __index, - __m128d __mask, const int __scale) { +__funline __m128d _mm_mask_i64gather_pd(__m128d __src, double const *__base, + __m128i __index, __m128d __mask, + const int __scale) { return (__m128d)__builtin_ia32_gatherdiv2df( (__v2df)__src, __base, (__v2di)__index, (__v2df)__mask, __scale); } -extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_i64gather_pd(double const *__base, __m256i __index, const int __scale) { +__funline __m256d _mm256_i64gather_pd(double const *__base, __m256i __index, + const int __scale) { __v4df __src = _mm256_setzero_pd(); __v4df __mask = _mm256_cmp_pd(__src, __src, _CMP_EQ_OQ); return (__m256d)__builtin_ia32_gatherdiv4df(__src, __base, (__v4di)__index, __mask, __scale); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i64gather_pd(__m256d __src, double const *__base, - __m256i __index, __m256d __mask, - const int __scale) { +__funline __m256d _mm256_mask_i64gather_pd(__m256d __src, double const *__base, + __m256i __index, __m256d __mask, + const int __scale) { return (__m256d)__builtin_ia32_gatherdiv4df( (__v4df)__src, __base, (__v4di)__index, (__v4df)__mask, __scale); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i32gather_ps(float const *__base, __m128i __index, const int __scale) { +__funline __m128 _mm_i32gather_ps(float const *__base, __m128i __index, + const int __scale) { __v4sf __src = _mm_setzero_ps(); __v4sf __mask = _mm_cmpeq_ps(__src, __src); return (__m128)__builtin_ia32_gathersiv4sf(__src, __base, (__v4si)__index, __mask, __scale); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_i32gather_ps(__m128 __src, float const *__base, __m128i __index, - __m128 __mask, const int __scale) { +__funline __m128 _mm_mask_i32gather_ps(__m128 __src, float const *__base, + __m128i __index, __m128 __mask, + const int __scale) { return (__m128)__builtin_ia32_gathersiv4sf( (__v4sf)__src, __base, (__v4si)__index, (__v4sf)__mask, __scale); } -extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_i32gather_ps(float const *__base, __m256i __index, const int __scale) { +__funline __m256 _mm256_i32gather_ps(float const *__base, __m256i __index, + const int __scale) { __v8sf __src = _mm256_setzero_ps(); __v8sf __mask = _mm256_cmp_ps(__src, __src, _CMP_EQ_OQ); return (__m256)__builtin_ia32_gathersiv8sf(__src, __base, (__v8si)__index, __mask, __scale); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i32gather_ps(__m256 __src, float const *__base, __m256i __index, - __m256 __mask, const int __scale) { +__funline __m256 _mm256_mask_i32gather_ps(__m256 __src, float const *__base, + __m256i __index, __m256 __mask, + const int __scale) { return (__m256)__builtin_ia32_gathersiv8sf( (__v8sf)__src, __base, (__v8si)__index, (__v8sf)__mask, __scale); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i64gather_ps(float const *__base, __m128i __index, const int __scale) { +__funline __m128 _mm_i64gather_ps(float const *__base, __m128i __index, + const int __scale) { __v4sf __src = _mm_setzero_ps(); __v4sf __mask = _mm_cmpeq_ps(__src, __src); return (__m128)__builtin_ia32_gatherdiv4sf(__src, __base, (__v2di)__index, __mask, 
__scale); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_i64gather_ps(__m128 __src, float const *__base, __m128i __index, - __m128 __mask, const int __scale) { +__funline __m128 _mm_mask_i64gather_ps(__m128 __src, float const *__base, + __m128i __index, __m128 __mask, + const int __scale) { return (__m128)__builtin_ia32_gatherdiv4sf( (__v4sf)__src, __base, (__v2di)__index, (__v4sf)__mask, __scale); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_i64gather_ps(float const *__base, __m256i __index, const int __scale) { +__funline __m128 _mm256_i64gather_ps(float const *__base, __m256i __index, + const int __scale) { __v4sf __src = _mm_setzero_ps(); __v4sf __mask = _mm_cmpeq_ps(__src, __src); return (__m128)__builtin_ia32_gatherdiv4sf256(__src, __base, (__v4di)__index, __mask, __scale); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i64gather_ps(__m128 __src, float const *__base, __m256i __index, - __m128 __mask, const int __scale) { +__funline __m128 _mm256_mask_i64gather_ps(__m128 __src, float const *__base, + __m256i __index, __m128 __mask, + const int __scale) { return (__m128)__builtin_ia32_gatherdiv4sf256( (__v4sf)__src, __base, (__v4di)__index, (__v4sf)__mask, __scale); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i32gather_epi64(long long int const *__base, __m128i __index, - const int __scale) { +__funline __m128i _mm_i32gather_epi64(long long int const *__base, + __m128i __index, const int __scale) { __v2di __src = __extension__(__v2di){0, 0}; __v2di __mask = __extension__(__v2di){~0, ~0}; return (__m128i)__builtin_ia32_gathersiv2di(__src, __base, (__v4si)__index, __mask, __scale); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_i32gather_epi64(__m128i __src, long long int const *__base, - __m128i __index, __m128i __mask, - const int __scale) { +__funline __m128i _mm_mask_i32gather_epi64(__m128i __src, + long long int const *__base, + __m128i __index, __m128i __mask, + const int __scale) { return (__m128i)__builtin_ia32_gathersiv2di( (__v2di)__src, __base, (__v4si)__index, (__v2di)__mask, __scale); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_i32gather_epi64(long long int const *__base, __m128i __index, - const int __scale) { +__funline __m256i _mm256_i32gather_epi64(long long int const *__base, + __m128i __index, const int __scale) { __v4di __src = __extension__(__v4di){0, 0, 0, 0}; __v4di __mask = __extension__(__v4di){~0, ~0, ~0, ~0}; return (__m256i)__builtin_ia32_gathersiv4di(__src, __base, (__v4si)__index, __mask, __scale); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i32gather_epi64(__m256i __src, long long int const *__base, - __m128i __index, __m256i __mask, - const int __scale) { +__funline __m256i _mm256_mask_i32gather_epi64(__m256i __src, + long long int const *__base, + __m128i __index, __m256i __mask, + const int __scale) { return (__m256i)__builtin_ia32_gathersiv4di( (__v4di)__src, __base, (__v4si)__index, (__v4di)__mask, __scale); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i64gather_epi64(long long int const *__base, __m128i __index, - const int __scale) { +__funline __m128i _mm_i64gather_epi64(long 
long int const *__base, + __m128i __index, const int __scale) { __v2di __src = __extension__(__v2di){0, 0}; __v2di __mask = __extension__(__v2di){~0, ~0}; return (__m128i)__builtin_ia32_gatherdiv2di(__src, __base, (__v2di)__index, __mask, __scale); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_i64gather_epi64(__m128i __src, long long int const *__base, - __m128i __index, __m128i __mask, - const int __scale) { +__funline __m128i _mm_mask_i64gather_epi64(__m128i __src, + long long int const *__base, + __m128i __index, __m128i __mask, + const int __scale) { return (__m128i)__builtin_ia32_gatherdiv2di( (__v2di)__src, __base, (__v2di)__index, (__v2di)__mask, __scale); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_i64gather_epi64(long long int const *__base, __m256i __index, - const int __scale) { +__funline __m256i _mm256_i64gather_epi64(long long int const *__base, + __m256i __index, const int __scale) { __v4di __src = __extension__(__v4di){0, 0, 0, 0}; __v4di __mask = __extension__(__v4di){~0, ~0, ~0, ~0}; return (__m256i)__builtin_ia32_gatherdiv4di(__src, __base, (__v4di)__index, __mask, __scale); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i64gather_epi64(__m256i __src, long long int const *__base, - __m256i __index, __m256i __mask, - const int __scale) { +__funline __m256i _mm256_mask_i64gather_epi64(__m256i __src, + long long int const *__base, + __m256i __index, __m256i __mask, + const int __scale) { return (__m256i)__builtin_ia32_gatherdiv4di( (__v4di)__src, __base, (__v4di)__index, (__v4di)__mask, __scale); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i32gather_epi32(int const *__base, __m128i __index, const int __scale) { +__funline __m128i _mm_i32gather_epi32(int const *__base, __m128i __index, + const int __scale) { __v4si __src = __extension__(__v4si){0, 0, 0, 0}; __v4si __mask = __extension__(__v4si){~0, ~0, ~0, ~0}; return (__m128i)__builtin_ia32_gathersiv4si(__src, __base, (__v4si)__index, __mask, __scale); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_i32gather_epi32(__m128i __src, int const *__base, __m128i __index, - __m128i __mask, const int __scale) { +__funline __m128i _mm_mask_i32gather_epi32(__m128i __src, int const *__base, + __m128i __index, __m128i __mask, + const int __scale) { return (__m128i)__builtin_ia32_gathersiv4si( (__v4si)__src, __base, (__v4si)__index, (__v4si)__mask, __scale); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_i32gather_epi32(int const *__base, __m256i __index, const int __scale) { +__funline __m256i _mm256_i32gather_epi32(int const *__base, __m256i __index, + const int __scale) { __v8si __src = __extension__(__v8si){0, 0, 0, 0, 0, 0, 0, 0}; __v8si __mask = __extension__(__v8si){~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0}; return (__m256i)__builtin_ia32_gathersiv8si(__src, __base, (__v8si)__index, __mask, __scale); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i32gather_epi32(__m256i __src, int const *__base, - __m256i __index, __m256i __mask, - const int __scale) { +__funline __m256i _mm256_mask_i32gather_epi32(__m256i __src, int const *__base, + __m256i __index, __m256i __mask, + const int __scale) { return 
(__m256i)__builtin_ia32_gathersiv8si( (__v8si)__src, __base, (__v8si)__index, (__v8si)__mask, __scale); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i64gather_epi32(int const *__base, __m128i __index, const int __scale) { +__funline __m128i _mm_i64gather_epi32(int const *__base, __m128i __index, + const int __scale) { __v4si __src = __extension__(__v4si){0, 0, 0, 0}; __v4si __mask = __extension__(__v4si){~0, ~0, ~0, ~0}; return (__m128i)__builtin_ia32_gatherdiv4si(__src, __base, (__v2di)__index, __mask, __scale); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_i64gather_epi32(__m128i __src, int const *__base, __m128i __index, - __m128i __mask, const int __scale) { +__funline __m128i _mm_mask_i64gather_epi32(__m128i __src, int const *__base, + __m128i __index, __m128i __mask, + const int __scale) { return (__m128i)__builtin_ia32_gatherdiv4si( (__v4si)__src, __base, (__v2di)__index, (__v4si)__mask, __scale); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_i64gather_epi32(int const *__base, __m256i __index, const int __scale) { +__funline __m128i _mm256_i64gather_epi32(int const *__base, __m256i __index, + const int __scale) { __v4si __src = __extension__(__v4si){0, 0, 0, 0}; __v4si __mask = __extension__(__v4si){~0, ~0, ~0, ~0}; return (__m128i)__builtin_ia32_gatherdiv4si256(__src, __base, (__v4di)__index, __mask, __scale); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i64gather_epi32(__m128i __src, int const *__base, - __m256i __index, __m128i __mask, - const int __scale) { +__funline __m128i _mm256_mask_i64gather_epi32(__m128i __src, int const *__base, + __m256i __index, __m128i __mask, + const int __scale) { return (__m128i)__builtin_ia32_gatherdiv4si256( (__v4si)__src, __base, (__v4di)__index, (__v4si)__mask, __scale); } diff --git a/third_party/intel/avx5124fmapsintrin.internal.h b/third_party/intel/avx5124fmapsintrin.internal.h index 0c421fbd5..289540f04 100644 --- a/third_party/intel/avx5124fmapsintrin.internal.h +++ b/third_party/intel/avx5124fmapsintrin.internal.h @@ -12,109 +12,93 @@ #define __DISABLE_AVX5124FMAPS__ #endif /* __AVX5124FMAPS__ */ -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_4fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __m512 __D, __m512 __E, - __m128 *__F) { +__funline __m512 _mm512_4fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __m512 __D, + __m512 __E, __m128 *__F) { return (__m512)__builtin_ia32_4fmaddps((__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A, (const __v4sf *)__F); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_4fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C, - __m512 __D, __m512 __E, __m128 *__F) { +__funline __m512 _mm512_mask_4fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, __m512 __D, __m512 __E, + __m128 *__F) { return (__m512)__builtin_ia32_4fmaddps_mask( (__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A, (const __v4sf *)__F, (__v16sf)__A, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_4fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C, - __m512 __D, __m512 __E, __m128 *__F) { +__funline __m512 _mm512_maskz_4fmadd_ps(__mmask16 __U, __m512 __A, 
__m512 __B, + __m512 __C, __m512 __D, __m512 __E, + __m128 *__F) { return (__m512)__builtin_ia32_4fmaddps_mask( (__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A, (const __v4sf *)__F, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_4fmadd_ss(__m128 __A, __m128 __B, __m128 __C, __m128 __D, __m128 __E, - __m128 *__F) { +__funline __m128 _mm_4fmadd_ss(__m128 __A, __m128 __B, __m128 __C, __m128 __D, + __m128 __E, __m128 *__F) { return (__m128)__builtin_ia32_4fmaddss((__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A, (const __v4sf *)__F); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_4fmadd_ss(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C, - __m128 __D, __m128 __E, __m128 *__F) { +__funline __m128 _mm_mask_4fmadd_ss(__m128 __A, __mmask8 __U, __m128 __B, + __m128 __C, __m128 __D, __m128 __E, + __m128 *__F) { return (__m128)__builtin_ia32_4fmaddss_mask( (__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A, (const __v4sf *)__F, (__v4sf)__A, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_4fmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C, - __m128 __D, __m128 __E, __m128 *__F) { +__funline __m128 _mm_maskz_4fmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, + __m128 __C, __m128 __D, __m128 __E, + __m128 *__F) { return (__m128)__builtin_ia32_4fmaddss_mask( (__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A, (const __v4sf *)__F, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_4fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __m512 __D, - __m512 __E, __m128 *__F) { +__funline __m512 _mm512_4fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __m512 __D, + __m512 __E, __m128 *__F) { return (__m512)__builtin_ia32_4fnmaddps((__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A, (const __v4sf *)__F); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_4fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C, - __m512 __D, __m512 __E, __m128 *__F) { +__funline __m512 _mm512_mask_4fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, __m512 __D, __m512 __E, + __m128 *__F) { return (__m512)__builtin_ia32_4fnmaddps_mask( (__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A, (const __v4sf *)__F, (__v16sf)__A, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_4fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C, - __m512 __D, __m512 __E, __m128 *__F) { +__funline __m512 _mm512_maskz_4fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, __m512 __D, __m512 __E, + __m128 *__F) { return (__m512)__builtin_ia32_4fnmaddps_mask( (__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A, (const __v4sf *)__F, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_4fnmadd_ss(__m128 __A, __m128 __B, __m128 __C, __m128 __D, __m128 __E, - __m128 *__F) { +__funline __m128 _mm_4fnmadd_ss(__m128 __A, __m128 __B, __m128 __C, __m128 __D, + __m128 __E, __m128 *__F) { return (__m128)__builtin_ia32_4fnmaddss((__v4sf)__B, (__v4sf)__C, (__v4sf)__D, 
(__v4sf)__E, (__v4sf)__A, (const __v4sf *)__F); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_4fnmadd_ss(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C, - __m128 __D, __m128 __E, __m128 *__F) { +__funline __m128 _mm_mask_4fnmadd_ss(__m128 __A, __mmask8 __U, __m128 __B, + __m128 __C, __m128 __D, __m128 __E, + __m128 *__F) { return (__m128)__builtin_ia32_4fnmaddss_mask( (__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A, (const __v4sf *)__F, (__v4sf)__A, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_4fnmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C, - __m128 __D, __m128 __E, __m128 *__F) { +__funline __m128 _mm_maskz_4fnmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, + __m128 __C, __m128 __D, __m128 __E, + __m128 *__F) { return (__m128)__builtin_ia32_4fnmaddss_mask( (__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A, (const __v4sf *)__F, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); diff --git a/third_party/intel/avx5124vnniwintrin.internal.h b/third_party/intel/avx5124vnniwintrin.internal.h index f3c027f16..71ea91c09 100644 --- a/third_party/intel/avx5124vnniwintrin.internal.h +++ b/third_party/intel/avx5124vnniwintrin.internal.h @@ -12,59 +12,50 @@ #define __DISABLE_AVX5124VNNIW__ #endif /* __AVX5124VNNIW__ */ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_4dpwssd_epi32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, - __m512i __E, __m128i *__F) { +__funline __m512i _mm512_4dpwssd_epi32(__m512i __A, __m512i __B, __m512i __C, + __m512i __D, __m512i __E, __m128i *__F) { return (__m512i)__builtin_ia32_vp4dpwssd((__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A, (const __v4si *)__F); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_4dpwssd_epi32(__m512i __A, __mmask16 __U, __m512i __B, - __m512i __C, __m512i __D, __m512i __E, - __m128i *__F) { +__funline __m512i _mm512_mask_4dpwssd_epi32(__m512i __A, __mmask16 __U, + __m512i __B, __m512i __C, __m512i __D, + __m512i __E, __m128i *__F) { return (__m512i)__builtin_ia32_vp4dpwssd_mask( (__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A, (const __v4si *)__F, (__v16si)__A, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_4dpwssd_epi32(__mmask16 __U, __m512i __A, __m512i __B, - __m512i __C, __m512i __D, __m512i __E, - __m128i *__F) { +__funline __m512i _mm512_maskz_4dpwssd_epi32(__mmask16 __U, __m512i __A, + __m512i __B, __m512i __C, + __m512i __D, __m512i __E, + __m128i *__F) { return (__m512i)__builtin_ia32_vp4dpwssd_mask( (__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A, (const __v4si *)__F, (__v16si)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_4dpwssds_epi32(__m512i __A, __m512i __B, __m512i __C, __m512i __D, - __m512i __E, __m128i *__F) { +__funline __m512i _mm512_4dpwssds_epi32(__m512i __A, __m512i __B, __m512i __C, + __m512i __D, __m512i __E, __m128i *__F) { return (__m512i)__builtin_ia32_vp4dpwssds((__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A, (const __v4si *)__F); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_4dpwssds_epi32(__m512i __A, 
__mmask16 __U, __m512i __B, - __m512i __C, __m512i __D, __m512i __E, - __m128i *__F) { +__funline __m512i _mm512_mask_4dpwssds_epi32(__m512i __A, __mmask16 __U, + __m512i __B, __m512i __C, + __m512i __D, __m512i __E, + __m128i *__F) { return (__m512i)__builtin_ia32_vp4dpwssds_mask( (__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A, (const __v4si *)__F, (__v16si)__A, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_4dpwssds_epi32(__mmask16 __U, __m512i __A, __m512i __B, - __m512i __C, __m512i __D, __m512i __E, - __m128i *__F) { +__funline __m512i _mm512_maskz_4dpwssds_epi32(__mmask16 __U, __m512i __A, + __m512i __B, __m512i __C, + __m512i __D, __m512i __E, + __m128i *__F) { return (__m512i)__builtin_ia32_vp4dpwssds_mask( (__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A, (const __v4si *)__F, (__v16si)_mm512_setzero_ps(), (__mmask16)__U); diff --git a/third_party/intel/avx512bitalgintrin.internal.h b/third_party/intel/avx512bitalgintrin.internal.h index e8ea2bd82..7abb14c7b 100644 --- a/third_party/intel/avx512bitalgintrin.internal.h +++ b/third_party/intel/avx512bitalgintrin.internal.h @@ -12,15 +12,11 @@ #define __DISABLE_AVX512BITALG__ #endif /* __AVX512BITALG__ */ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_popcnt_epi8(__m512i __A) { +__funline __m512i _mm512_popcnt_epi8(__m512i __A) { return (__m512i)__builtin_ia32_vpopcountb_v64qi((__v64qi)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_popcnt_epi16(__m512i __A) { +__funline __m512i _mm512_popcnt_epi16(__m512i __A) { return (__m512i)__builtin_ia32_vpopcountw_v32hi((__v32hi)__A); } @@ -35,43 +31,34 @@ extern __inline __m512i #define __DISABLE_AVX512BITALGBW__ #endif /* __AVX512VLBW__ */ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) { +__funline __m512i _mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, + __m512i __B) { return (__m512i)__builtin_ia32_vpopcountb_v64qi_mask( (__v64qi)__A, (__v64qi)__B, (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __A) { +__funline __m512i _mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __A) { return (__m512i)__builtin_ia32_vpopcountb_v64qi_mask( (__v64qi)__A, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) { +__funline __m512i _mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, + __m512i __B) { return (__m512i)__builtin_ia32_vpopcountw_v32hi_mask( (__v32hi)__A, (__v32hi)__B, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __A) { +__funline __m512i _mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __A) { return (__m512i)__builtin_ia32_vpopcountw_v32hi_mask( (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B) { +__funline __mmask64 _mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B) { return 
(__mmask64)__builtin_ia32_vpshufbitqmb512_mask( (__v64qi)__A, (__v64qi)__B, (__mmask64)-1); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_bitshuffle_epi64_mask(__mmask64 __M, __m512i __A, __m512i __B) { +__funline __mmask64 _mm512_mask_bitshuffle_epi64_mask(__mmask64 __M, __m512i __A, + __m512i __B) { return (__mmask64)__builtin_ia32_vpshufbitqmb512_mask( (__v64qi)__A, (__v64qi)__B, (__mmask64)__M); } @@ -88,30 +75,24 @@ extern __inline __mmask64 #define __DISABLE_AVX512BITALGVLBW__ #endif /* __AVX512VLBW__ */ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) { +__funline __m256i _mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, + __m256i __B) { return (__m256i)__builtin_ia32_vpopcountb_v32qi_mask( (__v32qi)__A, (__v32qi)__B, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __A) { +__funline __m256i _mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __A) { return (__m256i)__builtin_ia32_vpopcountb_v32qi_mask( (__v32qi)__A, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B) { +__funline __mmask32 _mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B) { return (__mmask32)__builtin_ia32_vpshufbitqmb256_mask( (__v32qi)__A, (__v32qi)__B, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_bitshuffle_epi64_mask(__mmask32 __M, __m256i __A, __m256i __B) { +__funline __mmask32 _mm256_mask_bitshuffle_epi64_mask(__mmask32 __M, __m256i __A, + __m256i __B) { return (__mmask32)__builtin_ia32_vpshufbitqmb256_mask( (__v32qi)__A, (__v32qi)__B, (__mmask32)__M); } @@ -127,81 +108,59 @@ extern __inline __mmask32 #define __DISABLE_AVX512BITALGVL__ #endif /* __AVX512VLBW__ */ -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B) { +__funline __mmask16 _mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B) { return (__mmask16)__builtin_ia32_vpshufbitqmb128_mask( (__v16qi)__A, (__v16qi)__B, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_bitshuffle_epi64_mask(__mmask16 __M, __m128i __A, __m128i __B) { +__funline __mmask16 _mm_mask_bitshuffle_epi64_mask(__mmask16 __M, __m128i __A, + __m128i __B) { return (__mmask16)__builtin_ia32_vpshufbitqmb128_mask( (__v16qi)__A, (__v16qi)__B, (__mmask16)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_popcnt_epi8(__m256i __A) { +__funline __m256i _mm256_popcnt_epi8(__m256i __A) { return (__m256i)__builtin_ia32_vpopcountb_v32qi((__v32qi)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_popcnt_epi16(__m256i __A) { +__funline __m256i _mm256_popcnt_epi16(__m256i __A) { return (__m256i)__builtin_ia32_vpopcountw_v16hi((__v16hi)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_popcnt_epi8(__m128i __A) { +__funline __m128i _mm_popcnt_epi8(__m128i __A) { return (__m128i)__builtin_ia32_vpopcountb_v16qi((__v16qi)__A); } -extern 
__inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_popcnt_epi16(__m128i __A) { +__funline __m128i _mm_popcnt_epi16(__m128i __A) { return (__m128i)__builtin_ia32_vpopcountw_v8hi((__v8hi)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) { +__funline __m256i _mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, + __m256i __B) { return (__m256i)__builtin_ia32_vpopcountw_v16hi_mask( (__v16hi)__A, (__v16hi)__B, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __A) { +__funline __m256i _mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __A) { return (__m256i)__builtin_ia32_vpopcountw_v16hi_mask( (__v16hi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) { +__funline __m128i _mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) { return (__m128i)__builtin_ia32_vpopcountb_v16qi_mask( (__v16qi)__A, (__v16qi)__B, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __A) { +__funline __m128i _mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __A) { return (__m128i)__builtin_ia32_vpopcountb_v16qi_mask( (__v16qi)__A, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) { +__funline __m128i _mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) { return (__m128i)__builtin_ia32_vpopcountw_v8hi_mask((__v8hi)__A, (__v8hi)__B, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_vpopcountw_v8hi_mask( (__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } diff --git a/third_party/intel/avx512bwintrin.internal.h b/third_party/intel/avx512bwintrin.internal.h index 86356d265..235bb541b 100644 --- a/third_party/intel/avx512bwintrin.internal.h +++ b/third_party/intel/avx512bwintrin.internal.h @@ -16,642 +16,478 @@ typedef char __v64qi __attribute__((__vector_size__(64))); typedef unsigned long long __mmask64; -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _ktest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__CF) { +__funline unsigned char _ktest_mask32_u8(__mmask32 __A, __mmask32 __B, + unsigned char *__CF) { *__CF = (unsigned char)__builtin_ia32_ktestcsi(__A, __B); return (unsigned char)__builtin_ia32_ktestzsi(__A, __B); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _ktest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__CF) { +__funline unsigned char _ktest_mask64_u8(__mmask64 __A, __mmask64 __B, + unsigned char *__CF) { *__CF = (unsigned char)__builtin_ia32_ktestcdi(__A, __B); return (unsigned char)__builtin_ia32_ktestzdi(__A, __B); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _ktestz_mask32_u8(__mmask32 __A, 
__mmask32 __B) { +__funline unsigned char _ktestz_mask32_u8(__mmask32 __A, __mmask32 __B) { return (unsigned char)__builtin_ia32_ktestzsi(__A, __B); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _ktestz_mask64_u8(__mmask64 __A, __mmask64 __B) { +__funline unsigned char _ktestz_mask64_u8(__mmask64 __A, __mmask64 __B) { return (unsigned char)__builtin_ia32_ktestzdi(__A, __B); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _ktestc_mask32_u8(__mmask32 __A, __mmask32 __B) { +__funline unsigned char _ktestc_mask32_u8(__mmask32 __A, __mmask32 __B) { return (unsigned char)__builtin_ia32_ktestcsi(__A, __B); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _ktestc_mask64_u8(__mmask64 __A, __mmask64 __B) { +__funline unsigned char _ktestc_mask64_u8(__mmask64 __A, __mmask64 __B) { return (unsigned char)__builtin_ia32_ktestcdi(__A, __B); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kortest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__CF) { +__funline unsigned char _kortest_mask32_u8(__mmask32 __A, __mmask32 __B, + unsigned char *__CF) { *__CF = (unsigned char)__builtin_ia32_kortestcsi(__A, __B); return (unsigned char)__builtin_ia32_kortestzsi(__A, __B); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kortest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__CF) { +__funline unsigned char _kortest_mask64_u8(__mmask64 __A, __mmask64 __B, + unsigned char *__CF) { *__CF = (unsigned char)__builtin_ia32_kortestcdi(__A, __B); return (unsigned char)__builtin_ia32_kortestzdi(__A, __B); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kortestz_mask32_u8(__mmask32 __A, __mmask32 __B) { +__funline unsigned char _kortestz_mask32_u8(__mmask32 __A, __mmask32 __B) { return (unsigned char)__builtin_ia32_kortestzsi(__A, __B); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kortestz_mask64_u8(__mmask64 __A, __mmask64 __B) { +__funline unsigned char _kortestz_mask64_u8(__mmask64 __A, __mmask64 __B) { return (unsigned char)__builtin_ia32_kortestzdi(__A, __B); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kortestc_mask32_u8(__mmask32 __A, __mmask32 __B) { +__funline unsigned char _kortestc_mask32_u8(__mmask32 __A, __mmask32 __B) { return (unsigned char)__builtin_ia32_kortestcsi(__A, __B); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kortestc_mask64_u8(__mmask64 __A, __mmask64 __B) { +__funline unsigned char _kortestc_mask64_u8(__mmask64 __A, __mmask64 __B) { return (unsigned char)__builtin_ia32_kortestcdi(__A, __B); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kadd_mask32(__mmask32 __A, __mmask32 __B) { +__funline __mmask32 _kadd_mask32(__mmask32 __A, __mmask32 __B) { return (__mmask32)__builtin_ia32_kaddsi((__mmask32)__A, (__mmask32)__B); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kadd_mask64(__mmask64 __A, __mmask64 __B) { +__funline __mmask64 _kadd_mask64(__mmask64 __A, __mmask64 __B) { return (__mmask64)__builtin_ia32_kadddi((__mmask64)__A, (__mmask64)__B); } -extern __inline 
unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _cvtmask32_u32(__mmask32 __A) { +__funline unsigned int _cvtmask32_u32(__mmask32 __A) { return (unsigned int)__builtin_ia32_kmovd((__mmask32)__A); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _cvtmask64_u64(__mmask64 __A) { +__funline unsigned long long _cvtmask64_u64(__mmask64 __A) { return (unsigned long long)__builtin_ia32_kmovq((__mmask64)__A); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _cvtu32_mask32(unsigned int __A) { +__funline __mmask32 _cvtu32_mask32(unsigned int __A) { return (__mmask32)__builtin_ia32_kmovd((__mmask32)__A); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _cvtu64_mask64(unsigned long long __A) { +__funline __mmask64 _cvtu64_mask64(unsigned long long __A) { return (__mmask64)__builtin_ia32_kmovq((__mmask64)__A); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _load_mask32(__mmask32 *__A) { +__funline __mmask32 _load_mask32(__mmask32 *__A) { return (__mmask32)__builtin_ia32_kmovd(*__A); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _load_mask64(__mmask64 *__A) { +__funline __mmask64 _load_mask64(__mmask64 *__A) { return (__mmask64)__builtin_ia32_kmovq(*(__mmask64 *)__A); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _store_mask32(__mmask32 *__A, __mmask32 __B) { +__funline void _store_mask32(__mmask32 *__A, __mmask32 __B) { *(__mmask32 *)__A = __builtin_ia32_kmovd(__B); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _store_mask64(__mmask64 *__A, __mmask64 __B) { +__funline void _store_mask64(__mmask64 *__A, __mmask64 __B) { *(__mmask64 *)__A = __builtin_ia32_kmovq(__B); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _knot_mask32(__mmask32 __A) { +__funline __mmask32 _knot_mask32(__mmask32 __A) { return (__mmask32)__builtin_ia32_knotsi((__mmask32)__A); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _knot_mask64(__mmask64 __A) { +__funline __mmask64 _knot_mask64(__mmask64 __A) { return (__mmask64)__builtin_ia32_knotdi((__mmask64)__A); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kor_mask32(__mmask32 __A, __mmask32 __B) { +__funline __mmask32 _kor_mask32(__mmask32 __A, __mmask32 __B) { return (__mmask32)__builtin_ia32_korsi((__mmask32)__A, (__mmask32)__B); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kor_mask64(__mmask64 __A, __mmask64 __B) { +__funline __mmask64 _kor_mask64(__mmask64 __A, __mmask64 __B) { return (__mmask64)__builtin_ia32_kordi((__mmask64)__A, (__mmask64)__B); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kxnor_mask32(__mmask32 __A, __mmask32 __B) { +__funline __mmask32 _kxnor_mask32(__mmask32 __A, __mmask32 __B) { return (__mmask32)__builtin_ia32_kxnorsi((__mmask32)__A, (__mmask32)__B); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kxnor_mask64(__mmask64 __A, __mmask64 __B) { +__funline __mmask64 _kxnor_mask64(__mmask64 __A, __mmask64 __B) { return 
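/*
 * Sketch of the mask move/convert helpers above: masks round-trip between
 * GPRs, k registers, and memory via KMOVD (illustrative only; assumes
 * -mavx512bw, and spill_and_invert is a hypothetical name).
 */
#include <immintrin.h>
__mmask32 spill_and_invert(unsigned bits) {
  __mmask32 m = _cvtu32_mask32(bits); /* GPR -> k register */
  __mmask32 spill;
  _store_mask32(&spill, m);           /* k register -> memory */
  return _knot_mask32(_load_mask32(&spill));
}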
(__mmask64)__builtin_ia32_kxnordi((__mmask64)__A, (__mmask64)__B); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kxor_mask32(__mmask32 __A, __mmask32 __B) { +__funline __mmask32 _kxor_mask32(__mmask32 __A, __mmask32 __B) { return (__mmask32)__builtin_ia32_kxorsi((__mmask32)__A, (__mmask32)__B); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kxor_mask64(__mmask64 __A, __mmask64 __B) { +__funline __mmask64 _kxor_mask64(__mmask64 __A, __mmask64 __B) { return (__mmask64)__builtin_ia32_kxordi((__mmask64)__A, (__mmask64)__B); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kand_mask32(__mmask32 __A, __mmask32 __B) { +__funline __mmask32 _kand_mask32(__mmask32 __A, __mmask32 __B) { return (__mmask32)__builtin_ia32_kandsi((__mmask32)__A, (__mmask32)__B); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kand_mask64(__mmask64 __A, __mmask64 __B) { +__funline __mmask64 _kand_mask64(__mmask64 __A, __mmask64 __B) { return (__mmask64)__builtin_ia32_kanddi((__mmask64)__A, (__mmask64)__B); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kandn_mask32(__mmask32 __A, __mmask32 __B) { +__funline __mmask32 _kandn_mask32(__mmask32 __A, __mmask32 __B) { return (__mmask32)__builtin_ia32_kandnsi((__mmask32)__A, (__mmask32)__B); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kandn_mask64(__mmask64 __A, __mmask64 __B) { +__funline __mmask64 _kandn_mask64(__mmask64 __A, __mmask64 __B) { return (__mmask64)__builtin_ia32_kandndi((__mmask64)__A, (__mmask64)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_mov_epi16(__m512i __W, __mmask32 __U, __m512i __A) { +__funline __m512i _mm512_mask_mov_epi16(__m512i __W, __mmask32 __U, __m512i __A) { return (__m512i)__builtin_ia32_movdquhi512_mask((__v32hi)__A, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) { +__funline __m512i _mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) { return (__m512i)__builtin_ia32_movdquhi512_mask( (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const *__P) { +__funline __m512i _mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, + void const *__P) { return (__m512i)__builtin_ia32_loaddquhi512_mask( (const short *)__P, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_loadu_epi16(__mmask32 __U, void const *__P) { +__funline __m512i _mm512_maskz_loadu_epi16(__mmask32 __U, void const *__P) { return (__m512i)__builtin_ia32_loaddquhi512_mask( (const short *)__P, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_storeu_epi16(void *__P, __mmask32 __U, __m512i __A) { +__funline void _mm512_mask_storeu_epi16(void *__P, __mmask32 __U, __m512i __A) { __builtin_ia32_storedquhi512_mask((short *)__P, (__v32hi)__A, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, 
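/*
 * Sketch contrasting the merge- and zero-masked loads above: unselected
 * lanes either keep the pass-through operand or become zero (illustrative
 * only; assumes -mavx512bw, and the names are hypothetical).
 */
#include <immintrin.h>
static short src[32]; /* zero-initialized file-scope buffer */
__m512i masked_loads(void) {
  __m512i fallthru = _mm512_set1_epi16(-1);
  __m512i merged = _mm512_mask_loadu_epi16(fallthru, 0x0000ffffu, src);
  __m512i zeroed = _mm512_maskz_loadu_epi16(0x0000ffffu, src);
  return _mm512_add_epi16(merged, zeroed); /* high lanes: -1 + 0 */
}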
__always_inline__, __artificial__)) - _mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) { +__funline __m512i _mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) { return (__m512i)__builtin_ia32_movdquqi512_mask((__v64qi)__A, (__v64qi)__W, (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mov_epi8(__mmask64 __U, __m512i __A) { +__funline __m512i _mm512_maskz_mov_epi8(__mmask64 __U, __m512i __A) { return (__m512i)__builtin_ia32_movdquqi512_mask( (__v64qi)__A, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_kunpackw(__mmask32 __A, __mmask32 __B) { +__funline __mmask32 _mm512_kunpackw(__mmask32 __A, __mmask32 __B) { return (__mmask32)__builtin_ia32_kunpcksi((__mmask32)__A, (__mmask32)__B); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kunpackw_mask32(__mmask16 __A, __mmask16 __B) { +__funline __mmask32 _kunpackw_mask32(__mmask16 __A, __mmask16 __B) { return (__mmask32)__builtin_ia32_kunpcksi((__mmask32)__A, (__mmask32)__B); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_kunpackd(__mmask64 __A, __mmask64 __B) { +__funline __mmask64 _mm512_kunpackd(__mmask64 __A, __mmask64 __B) { return (__mmask64)__builtin_ia32_kunpckdi((__mmask64)__A, (__mmask64)__B); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kunpackd_mask64(__mmask32 __A, __mmask32 __B) { +__funline __mmask64 _kunpackd_mask64(__mmask32 __A, __mmask32 __B) { return (__mmask64)__builtin_ia32_kunpckdi((__mmask64)__A, (__mmask64)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_loadu_epi8(__m512i __W, __mmask64 __U, void const *__P) { +__funline __m512i _mm512_mask_loadu_epi8(__m512i __W, __mmask64 __U, + void const *__P) { return (__m512i)__builtin_ia32_loaddquqi512_mask( (const char *)__P, (__v64qi)__W, (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_loadu_epi8(__mmask64 __U, void const *__P) { +__funline __m512i _mm512_maskz_loadu_epi8(__mmask64 __U, void const *__P) { return (__m512i)__builtin_ia32_loaddquqi512_mask( (const char *)__P, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_storeu_epi8(void *__P, __mmask64 __U, __m512i __A) { +__funline void _mm512_mask_storeu_epi8(void *__P, __mmask64 __U, __m512i __A) { __builtin_ia32_storedquqi512_mask((char *)__P, (__v64qi)__A, (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sad_epu8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_sad_epu8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_psadbw512((__v64qi)__A, (__v64qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepi16_epi8(__m512i __A) { +__funline __m256i _mm512_cvtepi16_epi8(__m512i __A) { return (__m256i)__builtin_ia32_pmovwb512_mask( (__v32hi)__A, (__v32qi)_mm256_undefined_si256(), (__mmask32)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi16_storeu_epi8(void *__P, __mmask32 __M, __m512i 
__A) { +__funline void _mm512_mask_cvtepi16_storeu_epi8(void *__P, __mmask32 __M, + __m512i __A) { __builtin_ia32_pmovwb512mem_mask((__v32qi *)__P, (__v32hi)__A, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi16_epi8(__m256i __O, __mmask32 __M, __m512i __A) { +__funline __m256i _mm512_mask_cvtepi16_epi8(__m256i __O, __mmask32 __M, + __m512i __A) { return (__m256i)__builtin_ia32_pmovwb512_mask((__v32hi)__A, (__v32qi)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepi16_epi8(__mmask32 __M, __m512i __A) { +__funline __m256i _mm512_maskz_cvtepi16_epi8(__mmask32 __M, __m512i __A) { return (__m256i)__builtin_ia32_pmovwb512_mask( (__v32hi)__A, (__v32qi)_mm256_setzero_si256(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtsepi16_epi8(__m512i __A) { +__funline __m256i _mm512_cvtsepi16_epi8(__m512i __A) { return (__m256i)__builtin_ia32_pmovswb512_mask( (__v32hi)__A, (__v32qi)_mm256_undefined_si256(), (__mmask32)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtsepi16_storeu_epi8(void *__P, __mmask32 __M, __m512i __A) { +__funline void _mm512_mask_cvtsepi16_storeu_epi8(void *__P, __mmask32 __M, + __m512i __A) { __builtin_ia32_pmovswb512mem_mask((__v32qi *)__P, (__v32hi)__A, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtsepi16_epi8(__m256i __O, __mmask32 __M, __m512i __A) { +__funline __m256i _mm512_mask_cvtsepi16_epi8(__m256i __O, __mmask32 __M, + __m512i __A) { return (__m256i)__builtin_ia32_pmovswb512_mask((__v32hi)__A, (__v32qi)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtsepi16_epi8(__mmask32 __M, __m512i __A) { +__funline __m256i _mm512_maskz_cvtsepi16_epi8(__mmask32 __M, __m512i __A) { return (__m256i)__builtin_ia32_pmovswb512_mask( (__v32hi)__A, (__v32qi)_mm256_setzero_si256(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtusepi16_epi8(__m512i __A) { +__funline __m256i _mm512_cvtusepi16_epi8(__m512i __A) { return (__m256i)__builtin_ia32_pmovuswb512_mask( (__v32hi)__A, (__v32qi)_mm256_undefined_si256(), (__mmask32)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtusepi16_epi8(__m256i __O, __mmask32 __M, __m512i __A) { +__funline __m256i _mm512_mask_cvtusepi16_epi8(__m256i __O, __mmask32 __M, + __m512i __A) { return (__m256i)__builtin_ia32_pmovuswb512_mask((__v32hi)__A, (__v32qi)__O, __M); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtusepi16_storeu_epi8(void *__P, __mmask32 __M, __m512i __A) { +__funline void _mm512_mask_cvtusepi16_storeu_epi8(void *__P, __mmask32 __M, + __m512i __A) { __builtin_ia32_pmovuswb512mem_mask((__v32qi *)__P, (__v32hi)__A, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtusepi16_epi8(__mmask32 __M, __m512i __A) { +__funline __m256i _mm512_maskz_cvtusepi16_epi8(__mmask32 __M, __m512i __A) { return (__m256i)__builtin_ia32_pmovuswb512_mask( (__v32hi)__A, (__v32qi)_mm256_setzero_si256(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, 
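/*
 * Sketch of the saturating narrow conversions above: values outside int8
 * range clamp instead of wrapping (illustrative only; assumes -mavx512bw).
 */
#include <immintrin.h>
__m256i narrow_sat(void) {
  __m512i w = _mm512_set1_epi16(300); /* > INT8_MAX in every lane */
  return _mm512_cvtsepi16_epi8(w);    /* every byte saturates to 127 */
}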
__artificial__)) - _mm512_broadcastb_epi8(__m128i __A) { +__funline __m512i _mm512_broadcastb_epi8(__m128i __A) { return (__m512i)__builtin_ia32_pbroadcastb512_mask( (__v16qi)__A, (__v64qi)_mm512_undefined_epi32(), (__mmask64)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_broadcastb_epi8(__m512i __O, __mmask64 __M, __m128i __A) { +__funline __m512i _mm512_mask_broadcastb_epi8(__m512i __O, __mmask64 __M, + __m128i __A) { return (__m512i)__builtin_ia32_pbroadcastb512_mask((__v16qi)__A, (__v64qi)__O, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) { +__funline __m512i _mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) { return (__m512i)__builtin_ia32_pbroadcastb512_mask( (__v16qi)__A, (__v64qi)_mm512_setzero_si512(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) { +__funline __m512i _mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) { return (__m512i)__builtin_ia32_pbroadcastb512_gpr_mask(__A, (__v64qi)__O, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_set1_epi8(__mmask64 __M, char __A) { +__funline __m512i _mm512_maskz_set1_epi8(__mmask64 __M, char __A) { return (__m512i)__builtin_ia32_pbroadcastb512_gpr_mask( __A, (__v64qi)_mm512_setzero_si512(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_broadcastw_epi16(__m128i __A) { +__funline __m512i _mm512_broadcastw_epi16(__m128i __A) { return (__m512i)__builtin_ia32_pbroadcastw512_mask( (__v8hi)__A, (__v32hi)_mm512_undefined_epi32(), (__mmask32)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A) { +__funline __m512i _mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, + __m128i __A) { return (__m512i)__builtin_ia32_pbroadcastw512_mask((__v8hi)__A, (__v32hi)__O, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) { +__funline __m512i _mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) { return (__m512i)__builtin_ia32_pbroadcastw512_mask( (__v8hi)__A, (__v32hi)_mm512_setzero_si512(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) { +__funline __m512i _mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) { return (__m512i)__builtin_ia32_pbroadcastw512_gpr_mask(__A, (__v32hi)__O, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_set1_epi16(__mmask32 __M, short __A) { +__funline __m512i _mm512_maskz_set1_epi16(__mmask32 __M, short __A) { return (__m512i)__builtin_ia32_pbroadcastw512_gpr_mask( __A, (__v32hi)_mm512_setzero_si512(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mulhrs_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_mulhrs_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmulhrsw512_mask( (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern 
__inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, + __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmulhrsw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmulhrsw512_mask( (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mulhi_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_mulhi_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmulhw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmulhw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmulhw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mulhi_epu16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_mulhi_epu16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmulhuw512_mask( (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmulhuw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mulhi_epu16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_mulhi_epu16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmulhuw512_mask( (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mullo_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_mullo_epi16(__m512i __A, __m512i __B) { return (__m512i)((__v32hu)__A * (__v32hu)__B); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmullw512_mask((__v32hi)__A, 
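/*
 * Sketch of the high-multiply family above: _mm512_mulhrs_epi16 is the Q15
 * fixed-point multiply, (a*b + 0x4000) >> 15 per 16-bit lane (illustrative
 * only; assumes -mavx512bw).
 */
#include <immintrin.h>
__m512i q15_mul(__m512i a, __m512i b) {
  return _mm512_mulhrs_epi16(a, b);
}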
(__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmullw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepi8_epi16(__m256i __A) { +__funline __m512i _mm512_cvtepi8_epi16(__m256i __A) { return (__m512i)__builtin_ia32_pmovsxbw512_mask( (__v32qi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A) { +__funline __m512i _mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, + __m256i __A) { return (__m512i)__builtin_ia32_pmovsxbw512_mask((__v32qi)__A, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) { +__funline __m512i _mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) { return (__m512i)__builtin_ia32_pmovsxbw512_mask( (__v32qi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepu8_epi16(__m256i __A) { +__funline __m512i _mm512_cvtepu8_epi16(__m256i __A) { return (__m512i)__builtin_ia32_pmovzxbw512_mask( (__v32qi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A) { +__funline __m512i _mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, + __m256i __A) { return (__m512i)__builtin_ia32_pmovzxbw512_mask((__v32qi)__A, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) { +__funline __m512i _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) { return (__m512i)__builtin_ia32_pmovzxbw512_mask( (__v32qi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permutexvar_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_permutexvar_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_permvarhi512_mask( (__v32hi)__B, (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutexvar_epi16(__mmask32 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_permutexvar_epi16(__mmask32 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_permvarhi512_mask( (__v32hi)__B, (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_permutexvar_epi16(__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) { +__funline __m512i _mm512_mask_permutexvar_epi16(__m512i __W, __mmask32 __M, + __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_permvarhi512_mask( (__v32hi)__B, (__v32hi)__A, (__v32hi)__W, 
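/*
 * Sketch of the full-width word permute above: an index vector of
 * 31,30,...,0 reverses all 32 lanes (illustrative only; assumes -mavx512bw
 * and that _mm512_set_epi16 is available in this header set).
 */
#include <immintrin.h>
__m512i reverse_words(__m512i v) {
  __m512i idx = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                                 24, 25, 26, 27, 28, 29, 30, 31);
  return _mm512_permutexvar_epi16(idx, v); /* first operand is the index */
}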
(__mmask32)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) { +__funline __m512i _mm512_permutex2var_epi16(__m512i __A, __m512i __I, + __m512i __B) { return (__m512i)__builtin_ia32_vpermt2varhi512_mask( (__v32hi)__I /* idx */, (__v32hi)__A, (__v32hi)__B, (__mmask32)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I, - __m512i __B) { +__funline __m512i _mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, + __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermt2varhi512_mask( (__v32hi)__I /* idx */, (__v32hi)__A, (__v32hi)__B, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, - __m512i __B) { +__funline __m512i _mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, + __mmask32 __U, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varhi512_mask((__v32hi)__A, (__v32hi)__I /* idx */, @@ -659,1626 +495,1290 @@ extern __inline __m512i (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, - __m512i __B) { +__funline __m512i _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, + __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermt2varhi512_maskz( (__v32hi)__I /* idx */, (__v32hi)__A, (__v32hi)__B, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_avg_epu8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_avg_epu8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pavgb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_avg_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_avg_epu8(__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pavgb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_avg_epu8(__mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_avg_epu8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pavgb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_add_epi8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_add_epi8(__m512i __A, __m512i __B) { return (__m512i)((__v64qu)__A + (__v64qu)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_paddb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) { 
+__funline __m512i _mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_paddb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sub_epi8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_sub_epi8(__m512i __A, __m512i __B) { return (__m512i)((__v64qu)__A - (__v64qu)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psubb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_psubb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_avg_epu16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_avg_epu16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pavgw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_avg_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_avg_epu16(__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pavgw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_avg_epu16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_avg_epu16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pavgw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_subs_epi8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_subs_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_psubsb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_subs_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_subs_epi8(__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psubsb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_subs_epi8(__mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_subs_epi8(__mmask64 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psubsb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_subs_epu8(__m512i __A, __m512i __B) { +__funline __m512i 
_mm512_subs_epu8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_psubusb512_mask( (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_subs_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_subs_epu8(__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psubusb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_subs_epu8(__mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_subs_epu8(__mmask64 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psubusb512_mask( (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_adds_epi8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_adds_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_paddsb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_adds_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_adds_epi8(__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_paddsb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_adds_epi8(__mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_adds_epi8(__mmask64 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_paddsb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_adds_epu8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_adds_epu8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_paddusb512_mask( (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_adds_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_adds_epu8(__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_paddusb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_adds_epu8(__mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_adds_epu8(__mmask64 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_paddusb512_mask( (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sub_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_sub_epi16(__m512i __A, __m512i __B) { return (__m512i)((__v32hu)__A - (__v32hu)__B); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i 
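/*
 * Sketch of the saturating byte arithmetic above: signed adds clamp at the
 * int8 limits rather than wrapping (illustrative only; assumes -mavx512bw).
 */
#include <immintrin.h>
__m512i clamp_add(__m512i a, __m512i b) {
  return _mm512_adds_epi8(a, b); /* e.g. 100 + 100 -> 127, not -56 */
}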
_mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psubw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psubw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_subs_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_subs_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_psubsw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_subs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_subs_epi16(__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psubsw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_subs_epi16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_subs_epi16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psubsw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_subs_epu16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_subs_epu16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_psubusw512_mask( (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_subs_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_subs_epu16(__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psubusw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_subs_epu16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_subs_epu16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psubusw512_mask( (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_add_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_add_epi16(__m512i __A, __m512i __B) { return (__m512i)((__v32hu)__A + (__v32hu)__B); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_paddw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline 
__m512i _mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_paddw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_adds_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_adds_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_paddsw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_adds_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_adds_epi16(__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_paddsw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_adds_epi16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_adds_epi16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_paddsw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_adds_epu16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_adds_epu16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_paddusw512_mask( (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_paddusw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_paddusw512_mask( (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_srl_epi16(__m512i __A, __m128i __B) { +__funline __m512i _mm512_srl_epi16(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrlw512_mask((__v32hi)__A, (__v8hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, + __m128i __B) { return (__m512i)__builtin_ia32_psrlw512_mask((__v32hi)__A, (__v8hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, + __m128i __B) { return (__m512i)__builtin_ia32_psrlw512_mask((__v32hi)__A, (__v8hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, 
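/*
 * Sketch of the vector shifts above: the count comes from the low quadword
 * of an XMM register and applies to every 16-bit lane (illustrative only;
 * assumes -mavx512bw).
 */
#include <immintrin.h>
__m512i halve_u16(__m512i v) {
  return _mm512_srl_epi16(v, _mm_cvtsi32_si128(1)); /* logical >> 1 */
}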
__artificial__)) - _mm512_packs_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_packs_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packsswb512_mask( (__v32hi)__A, (__v32hi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sll_epi16(__m512i __A, __m128i __B) { +__funline __m512i _mm512_sll_epi16(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psllw512_mask((__v32hi)__A, (__v8hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, + __m128i __B) { return (__m512i)__builtin_ia32_psllw512_mask((__v32hi)__A, (__v8hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, + __m128i __B) { return (__m512i)__builtin_ia32_psllw512_mask((__v32hi)__A, (__v8hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maddubs_epi16(__m512i __X, __m512i __Y) { +__funline __m512i _mm512_maddubs_epi16(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmaddubsw512_mask( (__v64qi)__X, (__v64qi)__Y, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X, - __m512i __Y) { +__funline __m512i _mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, + __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmaddubsw512_mask( (__v64qi)__X, (__v64qi)__Y, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_pmaddubsw512_mask( (__v64qi)__X, (__v64qi)__Y, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_madd_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_madd_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaddwd512_mask( (__v32hi)__A, (__v32hi)__B, (__v16si)_mm512_setzero_si512(), (__mmask16)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmaddwd512_mask((__v32hi)__A, (__v32hi)__B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmaddwd512_mask( (__v32hi)__A, (__v32hi)__B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); 
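/*
 * Sketch combining the multiply-add forms above into the classic 4-wide
 * u8 x s8 dot product (illustrative only; assumes -mavx512bw, and dot_u8s8
 * is a hypothetical name).
 */
#include <immintrin.h>
__m512i dot_u8s8(__m512i u, __m512i s) {
  __m512i p = _mm512_maddubs_epi16(u, s);            /* u8*s8 pairs -> saturated i16 */
  return _mm512_madd_epi16(p, _mm512_set1_epi16(1)); /* i16 pairs -> i32 sums */
}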
} -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_unpackhi_epi8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_unpackhi_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_punpckhbw512_mask( (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) { +__funline __m512i _mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, + __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_punpckhbw512_mask( (__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_punpckhbw512_mask( (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_unpackhi_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_unpackhi_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_punpckhwd512_mask( (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) { +__funline __m512i _mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, + __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_punpckhwd512_mask( (__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_punpckhwd512_mask( (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_unpacklo_epi8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_unpacklo_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_punpcklbw512_mask( (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) { +__funline __m512i _mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, + __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_punpcklbw512_mask( (__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_punpcklbw512_mask( (__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_unpacklo_epi16(__m512i __A, __m512i __B) { +__funline __m512i 
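/*
 * Sketch of the unpack forms above: interleaving happens independently
 * inside each 128-bit sub-lane, as with the SSE2 originals (illustrative
 * only; assumes -mavx512bw).
 */
#include <immintrin.h>
__m512i interleave_lo(__m512i a, __m512i b) {
  return _mm512_unpacklo_epi8(a, b); /* a0,b0,a1,b1,... per 128-bit lane */
}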
_mm512_unpacklo_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_punpcklwd512_mask( (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) { +__funline __m512i _mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, + __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_punpcklwd512_mask( (__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_punpcklwd512_mask( (__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpeq_epu8_mask(__m512i __A, __m512i __B) { +__funline __mmask64 _mm512_cmpeq_epu8_mask(__m512i __A, __m512i __B) { return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__A, (__v64qi)__B, 0, (__mmask64)-1); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpeq_epi8_mask(__m512i __A, __m512i __B) { +__funline __mmask64 _mm512_cmpeq_epi8_mask(__m512i __A, __m512i __B) { return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__A, (__v64qi)__B, (__mmask64)-1); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpeq_epu8_mask(__mmask64 __U, __m512i __A, __m512i __B) { +__funline __mmask64 _mm512_mask_cmpeq_epu8_mask(__mmask64 __U, __m512i __A, + __m512i __B) { return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__A, (__v64qi)__B, 0, __U); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpeq_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B) { +__funline __mmask64 _mm512_mask_cmpeq_epi8_mask(__mmask64 __U, __m512i __A, + __m512i __B) { return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__A, (__v64qi)__B, __U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpeq_epu16_mask(__m512i __A, __m512i __B) { +__funline __mmask32 _mm512_cmpeq_epu16_mask(__m512i __A, __m512i __B) { return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__A, (__v32hi)__B, 0, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpeq_epi16_mask(__m512i __A, __m512i __B) { +__funline __mmask32 _mm512_cmpeq_epi16_mask(__m512i __A, __m512i __B) { return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__A, (__v32hi)__B, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpeq_epu16_mask(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __mmask32 _mm512_mask_cmpeq_epu16_mask(__mmask32 __U, __m512i __A, + __m512i __B) { return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__A, (__v32hi)__B, 0, __U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpeq_epi16_mask(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __mmask32 _mm512_mask_cmpeq_epi16_mask(__mmask32 __U, __m512i __A, + __m512i __B) { return 
(__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__A, (__v32hi)__B, __U); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpgt_epu8_mask(__m512i __A, __m512i __B) { +__funline __mmask64 _mm512_cmpgt_epu8_mask(__m512i __A, __m512i __B) { return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__A, (__v64qi)__B, 6, (__mmask64)-1); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpgt_epi8_mask(__m512i __A, __m512i __B) { +__funline __mmask64 _mm512_cmpgt_epi8_mask(__m512i __A, __m512i __B) { return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__A, (__v64qi)__B, (__mmask64)-1); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpgt_epu8_mask(__mmask64 __U, __m512i __A, __m512i __B) { +__funline __mmask64 _mm512_mask_cmpgt_epu8_mask(__mmask64 __U, __m512i __A, + __m512i __B) { return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__A, (__v64qi)__B, 6, __U); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpgt_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B) { +__funline __mmask64 _mm512_mask_cmpgt_epi8_mask(__mmask64 __U, __m512i __A, + __m512i __B) { return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__A, (__v64qi)__B, __U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpgt_epu16_mask(__m512i __A, __m512i __B) { +__funline __mmask32 _mm512_cmpgt_epu16_mask(__m512i __A, __m512i __B) { return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__A, (__v32hi)__B, 6, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpgt_epi16_mask(__m512i __A, __m512i __B) { +__funline __mmask32 _mm512_cmpgt_epi16_mask(__m512i __A, __m512i __B) { return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__A, (__v32hi)__B, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpgt_epu16_mask(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __mmask32 _mm512_mask_cmpgt_epu16_mask(__mmask32 __U, __m512i __A, + __m512i __B) { return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__A, (__v32hi)__B, 6, __U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpgt_epi16_mask(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __mmask32 _mm512_mask_cmpgt_epi16_mask(__mmask32 __U, __m512i __A, + __m512i __B) { return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__A, (__v32hi)__B, __U); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_movepi8_mask(__m512i __A) { +__funline __mmask64 _mm512_movepi8_mask(__m512i __A) { return (__mmask64)__builtin_ia32_cvtb2mask512((__v64qi)__A); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_movepi16_mask(__m512i __A) { +__funline __mmask32 _mm512_movepi16_mask(__m512i __A) { return (__mmask32)__builtin_ia32_cvtw2mask512((__v32hi)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_movm_epi8(__mmask64 __A) { +__funline __m512i _mm512_movm_epi8(__mmask64 __A) { return (__m512i)__builtin_ia32_cvtmask2b512(__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, 
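/*
 * Sketch of the compare-to-mask forms above: each comparison yields one bit
 * per lane, so counting matches is a mask popcount (illustrative only;
 * assumes -mavx512bw).
 */
#include <immintrin.h>
int count_equal_bytes(__m512i a, __m512i b) {
  __mmask64 eq = _mm512_cmpeq_epi8_mask(a, b);
  return __builtin_popcountll((unsigned long long)eq);
}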
__always_inline__, __artificial__)) - _mm512_movm_epi16(__mmask32 __A) { +__funline __m512i _mm512_movm_epi16(__mmask32 __A) { return (__m512i)__builtin_ia32_cvtmask2w512(__A); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_test_epi8_mask(__m512i __A, __m512i __B) { +__funline __mmask64 _mm512_test_epi8_mask(__m512i __A, __m512i __B) { return (__mmask64)__builtin_ia32_ptestmb512((__v64qi)__A, (__v64qi)__B, (__mmask64)-1); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_test_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B) { +__funline __mmask64 _mm512_mask_test_epi8_mask(__mmask64 __U, __m512i __A, + __m512i __B) { return (__mmask64)__builtin_ia32_ptestmb512((__v64qi)__A, (__v64qi)__B, __U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_test_epi16_mask(__m512i __A, __m512i __B) { +__funline __mmask32 _mm512_test_epi16_mask(__m512i __A, __m512i __B) { return (__mmask32)__builtin_ia32_ptestmw512((__v32hi)__A, (__v32hi)__B, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_test_epi16_mask(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __mmask32 _mm512_mask_test_epi16_mask(__mmask32 __U, __m512i __A, + __m512i __B) { return (__mmask32)__builtin_ia32_ptestmw512((__v32hi)__A, (__v32hi)__B, __U); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_testn_epi8_mask(__m512i __A, __m512i __B) { +__funline __mmask64 _mm512_testn_epi8_mask(__m512i __A, __m512i __B) { return (__mmask64)__builtin_ia32_ptestnmb512((__v64qi)__A, (__v64qi)__B, (__mmask64)-1); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_testn_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B) { +__funline __mmask64 _mm512_mask_testn_epi8_mask(__mmask64 __U, __m512i __A, + __m512i __B) { return (__mmask64)__builtin_ia32_ptestnmb512((__v64qi)__A, (__v64qi)__B, __U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_testn_epi16_mask(__m512i __A, __m512i __B) { +__funline __mmask32 _mm512_testn_epi16_mask(__m512i __A, __m512i __B) { return (__mmask32)__builtin_ia32_ptestnmw512((__v32hi)__A, (__v32hi)__B, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_testn_epi16_mask(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __mmask32 _mm512_mask_testn_epi16_mask(__mmask32 __U, __m512i __A, + __m512i __B) { return (__mmask32)__builtin_ia32_ptestnmw512((__v32hi)__A, (__v32hi)__B, __U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shuffle_epi8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_shuffle_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pshufb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, + __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pshufb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__U); } -extern __inline __m512i - 
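/*
 * Sketch of the mask/vector conversions above: movepi8_mask gathers each
 * byte's sign bit, and movm_epi8 expands the bits back to 0x00/0xFF lanes
 * (illustrative only; assumes -mavx512bw).
 */
#include <immintrin.h>
__m512i sign_bytes(__m512i v) {
  return _mm512_movm_epi8(_mm512_movepi8_mask(v));
}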
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pshufb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_min_epu16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_min_epu16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminuw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_min_epu16(__mmask32 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_min_epu16(__mmask32 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pminuw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__M); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_min_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_min_epu16(__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pminuw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_min_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_min_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminsw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_min_epi16(__mmask32 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_min_epi16(__mmask32 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pminsw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__M); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_min_epi16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_min_epi16(__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pminsw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_max_epu8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_max_epu8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxub512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_max_epu8(__mmask64 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_max_epu8(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxub512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_max_epu8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_max_epu8(__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmaxub512_mask((__v64qi)__A, 
(__v64qi)__B, (__v64qi)__W, (__mmask64)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_max_epi8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_max_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxsb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_max_epi8(__mmask64 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_max_epi8(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxsb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_max_epi8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_max_epi8(__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmaxsb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_min_epu8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_min_epu8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminub512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_min_epu8(__mmask64 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_min_epu8(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminub512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_min_epu8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_min_epu8(__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pminub512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_min_epi8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_min_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminsb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_min_epi8(__mmask64 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_min_epi8(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminsb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_min_epi8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_min_epi8(__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pminsb512_mask((__v64qi)__A, (__v64qi)__B, (__v64qi)__W, (__mmask64)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_max_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_max_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxsw512_mask((__v32hi)__A, 
(__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_max_epi16(__mmask32 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_max_epi16(__mmask32 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmaxsw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__M); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_max_epi16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_max_epi16(__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmaxsw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_max_epu16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_max_epu16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxuw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_max_epu16(__mmask32 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_max_epu16(__mmask32 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmaxuw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__M); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_max_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_max_epu16(__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmaxuw512_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sra_epi16(__m512i __A, __m128i __B) { +__funline __m512i _mm512_sra_epi16(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psraw512_mask((__v32hi)__A, (__v8hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, + __m128i __B) { return (__m512i)__builtin_ia32_psraw512_mask((__v32hi)__A, (__v8hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, + __m128i __B) { return (__m512i)__builtin_ia32_psraw512_mask((__v32hi)__A, (__v8hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_srav_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_srav_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_psrav32hi_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_srav_epi16(__m512i __W, __mmask32 
__U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psrav32hi_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psrav32hi_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_srlv_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_srlv_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_psrlv32hi_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psrlv32hi_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psrlv32hi_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sllv_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_sllv_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_psllv32hi_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_sllv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_sllv_epi16(__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psllv32hi_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psllv32hi_mask((__v32hi)__A, (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_packsswb512_mask((__v32hi)__A, (__v32hi)__B, (__v64qi)__W, (__mmask64)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_packsswb512_mask( (__v32hi)__A, (__v32hi)__B, (__v64qi)_mm512_setzero_si512(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm512_packus_epi16(__m512i __A, __m512i __B) { +__funline __m512i _mm512_packus_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packuswb512_mask( (__v32hi)__A, (__v32hi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, + __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packuswb512_mask((__v32hi)__A, (__v32hi)__B, (__v64qi)__W, (__mmask64)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_packuswb512_mask( (__v32hi)__A, (__v32hi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_abs_epi8(__m512i __A) { +__funline __m512i _mm512_abs_epi8(__m512i __A) { return (__m512i)__builtin_ia32_pabsb512_mask( (__v64qi)__A, (__v64qi)_mm512_setzero_si512(), (__mmask64)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_abs_epi8(__m512i __W, __mmask64 __U, __m512i __A) { +__funline __m512i _mm512_mask_abs_epi8(__m512i __W, __mmask64 __U, __m512i __A) { return (__m512i)__builtin_ia32_pabsb512_mask((__v64qi)__A, (__v64qi)__W, (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_abs_epi8(__mmask64 __U, __m512i __A) { +__funline __m512i _mm512_maskz_abs_epi8(__mmask64 __U, __m512i __A) { return (__m512i)__builtin_ia32_pabsb512_mask( (__v64qi)__A, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_abs_epi16(__m512i __A) { +__funline __m512i _mm512_abs_epi16(__m512i __A) { return (__m512i)__builtin_ia32_pabsw512_mask( (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_abs_epi16(__m512i __W, __mmask32 __U, __m512i __A) { +__funline __m512i _mm512_mask_abs_epi16(__m512i __W, __mmask32 __U, __m512i __A) { return (__m512i)__builtin_ia32_pabsw512_mask((__v32hi)__A, (__v32hi)__W, (__mmask32)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_abs_epi16(__mmask32 __U, __m512i __A) { +__funline __m512i _mm512_maskz_abs_epi16(__mmask32 __U, __m512i __A) { return (__m512i)__builtin_ia32_pabsw512_mask( (__v32hi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpneq_epu8_mask(__mmask64 __M, __m512i __X, __m512i __Y) { +__funline __mmask64 _mm512_mask_cmpneq_epu8_mask(__mmask64 __M, __m512i __X, + __m512i __Y) { return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 4, (__mmask64)__M); } -extern __inline __mmask64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmplt_epu8_mask(__mmask64 __M, __m512i __X, __m512i __Y) { +__funline __mmask64 _mm512_mask_cmplt_epu8_mask(__mmask64 __M, __m512i __X, + __m512i __Y) { 
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpge_epu8_mask(__mmask64 __M, __m512i __X, __m512i __Y) {
+__funline __mmask64 _mm512_mask_cmpge_epu8_mask(__mmask64 __M, __m512i __X,
+    __m512i __Y) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 5, (__mmask64)__M);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmple_epu8_mask(__mmask64 __M, __m512i __X, __m512i __Y) {
+__funline __mmask64 _mm512_mask_cmple_epu8_mask(__mmask64 __M, __m512i __X,
+    __m512i __Y) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 2, (__mmask64)__M);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpneq_epu16_mask(__mmask32 __M, __m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_mask_cmpneq_epu16_mask(__mmask32 __M, __m512i __X,
+    __m512i __Y) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 4, (__mmask32)__M);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmplt_epu16_mask(__mmask32 __M, __m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_mask_cmplt_epu16_mask(__mmask32 __M, __m512i __X,
+    __m512i __Y) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 1, (__mmask32)__M);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpge_epu16_mask(__mmask32 __M, __m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_mask_cmpge_epu16_mask(__mmask32 __M, __m512i __X,
+    __m512i __Y) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 5, (__mmask32)__M);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmple_epu16_mask(__mmask32 __M, __m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_mask_cmple_epu16_mask(__mmask32 __M, __m512i __X,
+    __m512i __Y) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 2, (__mmask32)__M);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpneq_epi8_mask(__mmask64 __M, __m512i __X, __m512i __Y) {
+__funline __mmask64 _mm512_mask_cmpneq_epi8_mask(__mmask64 __M, __m512i __X,
+    __m512i __Y) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 4, (__mmask64)__M);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmplt_epi8_mask(__mmask64 __M, __m512i __X, __m512i __Y) {
+__funline __mmask64 _mm512_mask_cmplt_epi8_mask(__mmask64 __M, __m512i __X,
+    __m512i __Y) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 1, (__mmask64)__M);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpge_epi8_mask(__mmask64 __M, __m512i __X, __m512i __Y) {
+__funline __mmask64 _mm512_mask_cmpge_epi8_mask(__mmask64 __M, __m512i __X,
+    __m512i __Y) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 5, (__mmask64)__M);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmple_epi8_mask(__mmask64 __M, __m512i __X, __m512i __Y) {
+__funline __mmask64 _mm512_mask_cmple_epi8_mask(__mmask64 __M, __m512i __X,
+    __m512i __Y) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 2, (__mmask64)__M);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpneq_epi16_mask(__mmask32 __M, __m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_mask_cmpneq_epi16_mask(__mmask32 __M, __m512i __X,
+    __m512i __Y) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 4, (__mmask32)__M);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmplt_epi16_mask(__mmask32 __M, __m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_mask_cmplt_epi16_mask(__mmask32 __M, __m512i __X,
+    __m512i __Y) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 1, (__mmask32)__M);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpge_epi16_mask(__mmask32 __M, __m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_mask_cmpge_epi16_mask(__mmask32 __M, __m512i __X,
+    __m512i __Y) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 5, (__mmask32)__M);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmple_epi16_mask(__mmask32 __M, __m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_mask_cmple_epi16_mask(__mmask32 __M, __m512i __X,
+    __m512i __Y) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 2, (__mmask32)__M);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpneq_epu8_mask(__m512i __X, __m512i __Y) {
+__funline __mmask64 _mm512_cmpneq_epu8_mask(__m512i __X, __m512i __Y) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 4, (__mmask64)-1);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmplt_epu8_mask(__m512i __X, __m512i __Y) {
+__funline __mmask64 _mm512_cmplt_epu8_mask(__m512i __X, __m512i __Y) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 1, (__mmask64)-1);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpge_epu8_mask(__m512i __X, __m512i __Y) {
+__funline __mmask64 _mm512_cmpge_epu8_mask(__m512i __X, __m512i __Y) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 5, (__mmask64)-1);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmple_epu8_mask(__m512i __X, __m512i __Y) {
+__funline __mmask64 _mm512_cmple_epu8_mask(__m512i __X, __m512i __Y) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, 2, (__mmask64)-1);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpneq_epu16_mask(__m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_cmpneq_epu16_mask(__m512i __X, __m512i __Y) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 4, (__mmask32)-1);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmplt_epu16_mask(__m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_cmplt_epu16_mask(__m512i __X, __m512i __Y) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 1, (__mmask32)-1);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpge_epu16_mask(__m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_cmpge_epu16_mask(__m512i __X, __m512i __Y) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 5, (__mmask32)-1);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmple_epu16_mask(__m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_cmple_epu16_mask(__m512i __X, __m512i __Y) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, 2, (__mmask32)-1);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpneq_epi8_mask(__m512i __X, __m512i __Y) {
+__funline __mmask64 _mm512_cmpneq_epi8_mask(__m512i __X, __m512i __Y) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 4, (__mmask64)-1);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmplt_epi8_mask(__m512i __X, __m512i __Y) {
+__funline __mmask64 _mm512_cmplt_epi8_mask(__m512i __X, __m512i __Y) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 1, (__mmask64)-1);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpge_epi8_mask(__m512i __X, __m512i __Y) {
+__funline __mmask64 _mm512_cmpge_epi8_mask(__m512i __X, __m512i __Y) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 5, (__mmask64)-1);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmple_epi8_mask(__m512i __X, __m512i __Y) {
+__funline __mmask64 _mm512_cmple_epi8_mask(__m512i __X, __m512i __Y) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, 2, (__mmask64)-1);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpneq_epi16_mask(__m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_cmpneq_epi16_mask(__m512i __X, __m512i __Y) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 4, (__mmask32)-1);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmplt_epi16_mask(__m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_cmplt_epi16_mask(__m512i __X, __m512i __Y) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 1, (__mmask32)-1);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpge_epi16_mask(__m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_cmpge_epi16_mask(__m512i __X, __m512i __Y) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 5, (__mmask32)-1);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmple_epi16_mask(__m512i __X, __m512i __Y) {
+__funline __mmask32 _mm512_cmple_epi16_mask(__m512i __X, __m512i __Y) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, 2, (__mmask32)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_packs_epi32(__m512i __A, __m512i __B) {
+__funline __m512i _mm512_packs_epi32(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_packssdw512_mask((__v16si)__A, (__v16si)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) {
+__funline __m512i _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A,
+    __m512i __B) {
   return (__m512i)__builtin_ia32_packssdw512_mask((__v16si)__A, (__v16si)__B, (__v32hi)_mm512_setzero_si512(), __M);
 }
-extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
-    __artificial__))
-_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
+__funline __m512i _mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A,
+    __m512i __B) {
   return (__m512i)__builtin_ia32_packssdw512_mask((__v16si)__A, (__v16si)__B, (__v32hi)__W, __M);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_packus_epi32(__m512i __A, __m512i __B) {
+__funline __m512i _mm512_packus_epi32(__m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_packusdw512_mask((__v16si)__A, (__v16si)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) {
+__funline __m512i _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A,
+    __m512i __B) {
   return (__m512i)__builtin_ia32_packusdw512_mask((__v16si)__A, (__v16si)__B, (__v32hi)_mm512_setzero_si512(), __M);
 }
-extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
-    __artificial__))
-_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
+__funline __m512i _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M,
+    __m512i __A, __m512i __B) {
   return (__m512i)__builtin_ia32_packusdw512_mask((__v16si)__A, (__v16si)__B, (__v32hi)__W, __M);
 }
 #ifdef __OPTIMIZE__
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _kshiftli_mask32(__mmask32 __A, unsigned int __B) {
+__funline __mmask32 _kshiftli_mask32(__mmask32 __A, unsigned int __B) {
   return (__mmask32)__builtin_ia32_kshiftlisi((__mmask32)__A, (__mmask8)__B);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _kshiftli_mask64(__mmask64 __A, unsigned int __B) {
+__funline __mmask64 _kshiftli_mask64(__mmask64 __A, unsigned int __B) {
   return (__mmask64)__builtin_ia32_kshiftlidi((__mmask64)__A, (__mmask8)__B);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _kshiftri_mask32(__mmask32 __A, unsigned int __B) {
+__funline __mmask32 _kshiftri_mask32(__mmask32 __A, unsigned int __B) {
   return (__mmask32)__builtin_ia32_kshiftrisi((__mmask32)__A, (__mmask8)__B);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _kshiftri_mask64(__mmask64 __A, unsigned int __B) {
+__funline __mmask64 _kshiftri_mask64(__mmask64 __A, unsigned int __B) {
   return (__mmask64)__builtin_ia32_kshiftridi((__mmask64)__A, (__mmask8)__B);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_alignr_epi8(__m512i __A, __m512i __B, const int __N) {
+__funline __m512i _mm512_alignr_epi8(__m512i __A, __m512i __B, const int __N) {
   return (__m512i)__builtin_ia32_palignr512((__v8di)__A, (__v8di)__B, __N * 8);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_alignr_epi8(__m512i __W, __mmask64 __U, __m512i __A,
-    __m512i __B, const int __N) {
+__funline __m512i _mm512_mask_alignr_epi8(__m512i __W, __mmask64 __U, __m512i __A,
+    __m512i __B, const int __N) {
   return (__m512i)__builtin_ia32_palignr512_mask((__v8di)__A, (__v8di)__B, __N * 8, (__v8di)__W, (__mmask64)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_alignr_epi8(__mmask64 __U, __m512i __A, __m512i __B,
-    const int __N) {
+__funline __m512i _mm512_maskz_alignr_epi8(__mmask64 __U, __m512i __A,
+    __m512i __B, const int __N) {
   return (__m512i)__builtin_ia32_palignr512_mask((__v8di)__A, (__v8di)__B, __N * 8, (__v8di)_mm512_setzero_si512(), (__mmask64)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_dbsad_epu8(__m512i __A, __m512i __B, const int __imm) {
+__funline __m512i _mm512_dbsad_epu8(__m512i __A, __m512i __B, const int __imm) {
   return (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)__A, (__v64qi)__B, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_dbsad_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B,
-    const int __imm) {
+__funline __m512i _mm512_mask_dbsad_epu8(__m512i __W, __mmask32 __U, __m512i __A,
+    __m512i __B, const int __imm) {
   return (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)__A, (__v64qi)__B, __imm, (__v32hi)__W, (__mmask32)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_dbsad_epu8(__mmask32 __U, __m512i __A, __m512i __B,
-    const int __imm) {
+__funline __m512i _mm512_maskz_dbsad_epu8(__mmask32 __U, __m512i __A, __m512i __B,
+    const int __imm) {
   return (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)__A, (__v64qi)__B, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_srli_epi16(__m512i __A, const int __imm) {
+__funline __m512i _mm512_srli_epi16(__m512i __A, const int __imm) {
   return (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A,
-    const int __imm) {
+__funline __m512i _mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A,
+    const int __imm) {
   return (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)__A, __imm, (__v32hi)__W, (__mmask32)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, const int __imm) {
+__funline __m512i _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A,
+    const int __imm) {
   return (__m512i)__builtin_ia32_psrlwi512_mask((__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_slli_epi16(__m512i __A, const int __B) {
+__funline __m512i _mm512_slli_epi16(__m512i __A, const int __B) {
   return (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)__A, __B, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1);
 }
-extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
-    __artificial__))
-_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, const int __B) {
+__funline __m512i _mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A,
+    const int __B) {
   return (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)__A, __B, (__v32hi)__W, (__mmask32)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, const int __B) {
+__funline __m512i _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A,
+    const int __B) {
   return (__m512i)__builtin_ia32_psllwi512_mask((__v32hi)__A, __B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_shufflehi_epi16(__m512i __A, const int __imm) {
+__funline __m512i _mm512_shufflehi_epi16(__m512i __A, const int __imm) {
   return (__m512i)__builtin_ia32_pshufhw512_mask((__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_shufflehi_epi16(__m512i __W, __mmask32 __U, __m512i __A,
-    const int __imm) {
+__funline __m512i _mm512_mask_shufflehi_epi16(__m512i __W, __mmask32 __U,
+    __m512i __A, const int __imm) {
   return (__m512i)__builtin_ia32_pshufhw512_mask((__v32hi)__A, __imm, (__v32hi)__W, (__mmask32)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_shufflehi_epi16(__mmask32 __U, __m512i __A, const int __imm) {
+__funline __m512i _mm512_maskz_shufflehi_epi16(__mmask32 __U, __m512i __A,
+    const int __imm) {
   return (__m512i)__builtin_ia32_pshufhw512_mask((__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_shufflelo_epi16(__m512i __A, const int __imm) {
+__funline __m512i _mm512_shufflelo_epi16(__m512i __A, const int __imm) {
   return (__m512i)__builtin_ia32_pshuflw512_mask((__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_shufflelo_epi16(__m512i __W, __mmask32 __U, __m512i __A,
-    const int __imm) {
+__funline __m512i _mm512_mask_shufflelo_epi16(__m512i __W, __mmask32 __U,
+    __m512i __A, const int __imm) {
   return (__m512i)__builtin_ia32_pshuflw512_mask((__v32hi)__A, __imm, (__v32hi)__W, (__mmask32)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_shufflelo_epi16(__mmask32 __U, __m512i __A, const int __imm) {
+__funline __m512i _mm512_maskz_shufflelo_epi16(__mmask32 __U, __m512i __A,
+    const int __imm) {
   return (__m512i)__builtin_ia32_pshuflw512_mask((__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_srai_epi16(__m512i __A, const int __imm) {
+__funline __m512i _mm512_srai_epi16(__m512i __A, const int __imm) {
   return (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A,
-    const int __imm) {
+__funline __m512i _mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A,
+    const int __imm) {
   return (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)__A, __imm, (__v32hi)__W, (__mmask32)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, const int __imm) {
+__funline __m512i _mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A,
+    const int __imm) {
   return (__m512i)__builtin_ia32_psrawi512_mask((__v32hi)__A, __imm, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_blend_epi16(__mmask32 __U, __m512i __A, __m512i __W) {
+__funline __m512i _mm512_mask_blend_epi16(__mmask32 __U, __m512i __A,
+    __m512i __W) {
   return (__m512i)__builtin_ia32_blendmw_512_mask((__v32hi)__A, (__v32hi)__W, (__mmask32)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_blend_epi8(__mmask64 __U, __m512i __A, __m512i __W) {
+__funline __m512i _mm512_mask_blend_epi8(__mmask64 __U, __m512i __A,
+    __m512i __W) {
   return (__m512i)__builtin_ia32_blendmb_512_mask((__v64qi)__A, (__v64qi)__W, (__mmask64)__U);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmp_epi16_mask(__mmask32 __U, __m512i __X, __m512i __Y,
-    const int __P) {
+__funline __mmask32 _mm512_mask_cmp_epi16_mask(__mmask32 __U, __m512i __X,
+    __m512i __Y, const int __P) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, __P, (__mmask32)__U);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmp_epi16_mask(__m512i __X, __m512i __Y, const int __P) {
+__funline __mmask32 _mm512_cmp_epi16_mask(__m512i __X, __m512i __Y,
+    const int __P) {
   return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__X, (__v32hi)__Y, __P, (__mmask32)-1);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmp_epi8_mask(__mmask64 __U, __m512i __X, __m512i __Y,
-    const int __P) {
+__funline __mmask64 _mm512_mask_cmp_epi8_mask(__mmask64 __U, __m512i __X,
+    __m512i __Y, const int __P) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, __P, (__mmask64)__U);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmp_epi8_mask(__m512i __X, __m512i __Y, const int __P) {
+__funline __mmask64 _mm512_cmp_epi8_mask(__m512i __X, __m512i __Y,
+    const int __P) {
   return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__X, (__v64qi)__Y, __P, (__mmask64)-1);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmp_epu16_mask(__mmask32 __U, __m512i __X, __m512i __Y,
-    const int __P) {
+__funline __mmask32 _mm512_mask_cmp_epu16_mask(__mmask32 __U, __m512i __X,
+    __m512i __Y, const int __P) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, __P, (__mmask32)__U);
 }
-extern __inline __mmask32
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmp_epu16_mask(__m512i __X, __m512i __Y, const int __P) {
+__funline __mmask32 _mm512_cmp_epu16_mask(__m512i __X, __m512i __Y,
+    const int __P) {
   return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__X, (__v32hi)__Y, __P, (__mmask32)-1);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmp_epu8_mask(__mmask64 __U, __m512i __X, __m512i __Y,
-    const int __P) {
+__funline __mmask64 _mm512_mask_cmp_epu8_mask(__mmask64 __U, __m512i __X,
+    __m512i __Y, const int __P) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, __P, (__mmask64)__U);
 }
-extern __inline __mmask64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmp_epu8_mask(__m512i __X, __m512i __Y, const int __P) {
+__funline __mmask64 _mm512_cmp_epu8_mask(__m512i __X, __m512i __Y,
+    const int __P) {
   return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__X, (__v64qi)__Y, __P, (__mmask64)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_bslli_epi128(__m512i __A, const int __N) {
+__funline __m512i _mm512_bslli_epi128(__m512i __A, const int __N) {
   return (__m512i)__builtin_ia32_pslldq512(__A, __N * 8);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_bsrli_epi128(__m512i __A, const int __N) {
+__funline __m512i _mm512_bsrli_epi128(__m512i __A, const int __N) {
   return (__m512i)__builtin_ia32_psrldq512(__A, __N * 8);
 }
diff --git a/third_party/intel/avx512cdintrin.internal.h b/third_party/intel/avx512cdintrin.internal.h
index 685cf2c46..990347cff 100644
--- a/third_party/intel/avx512cdintrin.internal.h
+++ b/third_party/intel/avx512cdintrin.internal.h
@@ -20,99 +20,75 @@
 typedef double __m512d __attribute__((__vector_size__(64), __may_alias__));
 typedef unsigned char __mmask8;
 typedef unsigned short __mmask16;
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_conflict_epi32(__m512i __A) {
+__funline __m512i _mm512_conflict_epi32(__m512i __A) {
   return (__m512i)__builtin_ia32_vpconflictsi_512_mask((__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_conflict_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+__funline __m512i _mm512_mask_conflict_epi32(__m512i __W, __mmask16 __U,
+    __m512i __A) {
   return (__m512i)__builtin_ia32_vpconflictsi_512_mask((__v16si)__A, (__v16si)__W, (__mmask16)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_conflict_epi32(__mmask16 __U, __m512i __A) {
+__funline __m512i _mm512_maskz_conflict_epi32(__mmask16 __U, __m512i __A) {
   return (__m512i)__builtin_ia32_vpconflictsi_512_mask((__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_conflict_epi64(__m512i __A) {
+__funline __m512i _mm512_conflict_epi64(__m512i __A) {
   return (__m512i)__builtin_ia32_vpconflictdi_512_mask((__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_conflict_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+__funline __m512i _mm512_mask_conflict_epi64(__m512i __W, __mmask8 __U,
+    __m512i __A) {
   return (__m512i)__builtin_ia32_vpconflictdi_512_mask((__v8di)__A, (__v8di)__W, (__mmask8)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_conflict_epi64(__mmask8 __U, __m512i __A) {
+__funline __m512i _mm512_maskz_conflict_epi64(__mmask8 __U, __m512i __A) {
  return (__m512i)__builtin_ia32_vpconflictdi_512_mask((__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_lzcnt_epi64(__m512i __A) {
+__funline __m512i _mm512_lzcnt_epi64(__m512i __A) {
   return (__m512i)__builtin_ia32_vplzcntq_512_mask((__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_lzcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+__funline __m512i _mm512_mask_lzcnt_epi64(__m512i __W, __mmask8 __U,
+    __m512i __A) {
   return (__m512i)__builtin_ia32_vplzcntq_512_mask((__v8di)__A, (__v8di)__W, (__mmask8)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_lzcnt_epi64(__mmask8 __U, __m512i __A) {
+__funline __m512i _mm512_maskz_lzcnt_epi64(__mmask8 __U, __m512i __A) {
   return (__m512i)__builtin_ia32_vplzcntq_512_mask((__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_lzcnt_epi32(__m512i __A) {
+__funline __m512i _mm512_lzcnt_epi32(__m512i __A) {
   return (__m512i)__builtin_ia32_vplzcntd_512_mask((__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_lzcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+__funline __m512i _mm512_mask_lzcnt_epi32(__m512i __W, __mmask16 __U,
+    __m512i __A) {
   return (__m512i)__builtin_ia32_vplzcntd_512_mask((__v16si)__A, (__v16si)__W, (__mmask16)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_lzcnt_epi32(__mmask16 __U, __m512i __A) {
+__funline __m512i _mm512_maskz_lzcnt_epi32(__mmask16 __U, __m512i __A) {
   return (__m512i)__builtin_ia32_vplzcntd_512_mask((__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_broadcastmb_epi64(__mmask8 __A) {
+__funline __m512i _mm512_broadcastmb_epi64(__mmask8 __A) {
   return (__m512i)__builtin_ia32_broadcastmb512(__A);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_broadcastmw_epi32(__mmask16 __A) {
+__funline __m512i _mm512_broadcastmw_epi32(__mmask16 __A) {
   return (__m512i)__builtin_ia32_broadcastmw512(__A);
 }
diff --git a/third_party/intel/avx512dqintrin.internal.h b/third_party/intel/avx512dqintrin.internal.h
index fcd0ed97d..f6d2bc07f 100644
--- a/third_party/intel/avx512dqintrin.internal.h
+++ b/third_party/intel/avx512dqintrin.internal.h
@@ -11,1532 +11,1173 @@
 #define __DISABLE_AVX512DQ__
 #endif /* __AVX512DQ__ */
-extern __inline unsigned char
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _ktest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__CF) {
+__funline unsigned char _ktest_mask8_u8(__mmask8 __A, __mmask8 __B,
+    unsigned char *__CF) {
   *__CF = (unsigned char)__builtin_ia32_ktestcqi(__A, __B);
   return (unsigned char)__builtin_ia32_ktestzqi(__A, __B);
 }
-extern __inline unsigned char
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _ktestz_mask8_u8(__mmask8 __A, __mmask8 __B) {
+__funline unsigned char _ktestz_mask8_u8(__mmask8 __A, __mmask8 __B) {
   return (unsigned char)__builtin_ia32_ktestzqi(__A, __B);
 }
-extern __inline unsigned char
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _ktestc_mask8_u8(__mmask8 __A, __mmask8 __B) {
+__funline unsigned char _ktestc_mask8_u8(__mmask8 __A, __mmask8 __B) {
   return (unsigned char)__builtin_ia32_ktestcqi(__A, __B);
 }
-extern __inline unsigned char
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _ktest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__CF) {
+__funline unsigned char _ktest_mask16_u8(__mmask16 __A, __mmask16 __B,
+    unsigned char *__CF) {
   *__CF = (unsigned char)__builtin_ia32_ktestchi(__A, __B);
   return (unsigned char)__builtin_ia32_ktestzhi(__A, __B);
 }
-extern __inline unsigned char
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _ktestz_mask16_u8(__mmask16 __A, __mmask16 __B) {
+__funline unsigned char _ktestz_mask16_u8(__mmask16 __A, __mmask16 __B) {
   return (unsigned char)__builtin_ia32_ktestzhi(__A, __B);
 }
-extern __inline unsigned char
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _ktestc_mask16_u8(__mmask16 __A, __mmask16 __B) {
+__funline unsigned char _ktestc_mask16_u8(__mmask16 __A, __mmask16 __B) {
   return (unsigned char)__builtin_ia32_ktestchi(__A, __B);
 }
-extern __inline unsigned char
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _kortest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__CF) {
+__funline unsigned char _kortest_mask8_u8(__mmask8 __A, __mmask8 __B,
+    unsigned char *__CF) {
   *__CF = (unsigned char)__builtin_ia32_kortestcqi(__A, __B);
   return (unsigned char)__builtin_ia32_kortestzqi(__A, __B);
 }
-extern __inline unsigned char
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _kortestz_mask8_u8(__mmask8 __A, __mmask8 __B) {
+__funline unsigned char _kortestz_mask8_u8(__mmask8 __A, __mmask8 __B) {
   return (unsigned char)__builtin_ia32_kortestzqi(__A, __B);
 }
-extern __inline unsigned char
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _kortestc_mask8_u8(__mmask8 __A, __mmask8 __B) {
+__funline unsigned char _kortestc_mask8_u8(__mmask8 __A, __mmask8 __B) {
   return (unsigned char)__builtin_ia32_kortestcqi(__A, __B);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _kadd_mask8(__mmask8 __A, __mmask8 __B) {
+__funline __mmask8 _kadd_mask8(__mmask8 __A, __mmask8 __B) {
   return (__mmask8)__builtin_ia32_kaddqi((__mmask8)__A, (__mmask8)__B);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _kadd_mask16(__mmask16 __A, __mmask16 __B) {
+__funline __mmask16 _kadd_mask16(__mmask16 __A, __mmask16 __B) {
   return (__mmask16)__builtin_ia32_kaddhi((__mmask16)__A, (__mmask16)__B);
 }
-extern __inline unsigned int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _cvtmask8_u32(__mmask8 __A) {
+__funline unsigned int _cvtmask8_u32(__mmask8 __A) {
   return (unsigned int)__builtin_ia32_kmovb((__mmask8)__A);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _cvtu32_mask8(unsigned int __A) {
+__funline __mmask8 _cvtu32_mask8(unsigned int __A) {
   return (__mmask8)__builtin_ia32_kmovb((__mmask8)__A);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _load_mask8(__mmask8 *__A) {
+__funline __mmask8 _load_mask8(__mmask8 *__A) {
   return (__mmask8)__builtin_ia32_kmovb(*(__mmask8 *)__A);
 }
-extern __inline void
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _store_mask8(__mmask8 *__A, __mmask8 __B) {
+__funline void _store_mask8(__mmask8 *__A, __mmask8 __B) {
   *(__mmask8 *)__A = __builtin_ia32_kmovb(__B);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _knot_mask8(__mmask8 __A) {
+__funline __mmask8 _knot_mask8(__mmask8 __A) {
   return (__mmask8)__builtin_ia32_knotqi((__mmask8)__A);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _kor_mask8(__mmask8 __A, __mmask8 __B) {
+__funline __mmask8 _kor_mask8(__mmask8 __A, __mmask8 __B) {
   return (__mmask8)__builtin_ia32_korqi((__mmask8)__A, (__mmask8)__B);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _kxnor_mask8(__mmask8 __A, __mmask8 __B) {
+__funline __mmask8 _kxnor_mask8(__mmask8 __A, __mmask8 __B) {
   return (__mmask8)__builtin_ia32_kxnorqi((__mmask8)__A, (__mmask8)__B);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _kxor_mask8(__mmask8 __A, __mmask8 __B) {
+__funline __mmask8 _kxor_mask8(__mmask8 __A, __mmask8 __B) {
   return (__mmask8)__builtin_ia32_kxorqi((__mmask8)__A, (__mmask8)__B);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _kand_mask8(__mmask8 __A, __mmask8 __B) {
+__funline __mmask8 _kand_mask8(__mmask8 __A, __mmask8 __B) {
   return (__mmask8)__builtin_ia32_kandqi((__mmask8)__A, (__mmask8)__B);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _kandn_mask8(__mmask8 __A, __mmask8 __B) {
+__funline __mmask8 _kandn_mask8(__mmask8 __A, __mmask8 __B) {
   return (__mmask8)__builtin_ia32_kandnqi((__mmask8)__A, (__mmask8)__B);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_broadcast_f64x2(__m128d __A) {
+__funline __m512d _mm512_broadcast_f64x2(__m128d __A) {
   return (__m512d)__builtin_ia32_broadcastf64x2_512_mask((__v2df)__A, _mm512_undefined_pd(), (__mmask8)-1);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) {
+__funline __m512d _mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M,
+    __m128d __A) {
   return (__m512d)__builtin_ia32_broadcastf64x2_512_mask((__v2df)__A, (__v8df)__O, __M);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) {
+__funline __m512d _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) {
   return (__m512d)__builtin_ia32_broadcastf64x2_512_mask((__v2df)__A, (__v8df)_mm512_setzero_ps(), __M);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_broadcast_i64x2(__m128i __A) {
+__funline __m512i _mm512_broadcast_i64x2(__m128i __A) {
   return (__m512i)__builtin_ia32_broadcasti64x2_512_mask((__v2di)__A, _mm512_undefined_epi32(), (__mmask8)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) {
+__funline __m512i _mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M,
+    __m128i __A) {
   return (__m512i)__builtin_ia32_broadcasti64x2_512_mask((__v2di)__A, (__v8di)__O, __M);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) {
+__funline __m512i _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) {
   return (__m512i)__builtin_ia32_broadcasti64x2_512_mask((__v2di)__A, (__v8di)_mm512_setzero_si512(), __M);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_broadcast_f32x2(__m128 __A) {
+__funline __m512 _mm512_broadcast_f32x2(__m128 __A) {
   return (__m512)__builtin_ia32_broadcastf32x2_512_mask((__v4sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_broadcast_f32x2(__m512 __O, __mmask16 __M, __m128 __A) {
+__funline __m512 _mm512_mask_broadcast_f32x2(__m512 __O, __mmask16 __M,
+    __m128 __A) {
   return (__m512)__builtin_ia32_broadcastf32x2_512_mask((__v4sf)__A, (__v16sf)__O, __M);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_broadcast_f32x2(__mmask16 __M, __m128 __A) {
+__funline __m512 _mm512_maskz_broadcast_f32x2(__mmask16 __M, __m128 __A) {
   return (__m512)__builtin_ia32_broadcastf32x2_512_mask((__v4sf)__A, (__v16sf)_mm512_setzero_ps(), __M);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_broadcast_i32x2(__m128i __A) {
+__funline __m512i _mm512_broadcast_i32x2(__m128i __A) {
   return (__m512i)__builtin_ia32_broadcasti32x2_512_mask((__v4si)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M, __m128i __A) {
+__funline __m512i _mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M,
+    __m128i __A) {
   return (__m512i)__builtin_ia32_broadcasti32x2_512_mask((__v4si)__A, (__v16si)__O, __M);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) {
+__funline __m512i _mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) {
   return (__m512i)__builtin_ia32_broadcasti32x2_512_mask((__v4si)__A, (__v16si)_mm512_setzero_si512(), __M);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_broadcast_f32x8(__m256 __A) {
+__funline __m512 _mm512_broadcast_f32x8(__m256 __A) {
   return (__m512)__builtin_ia32_broadcastf32x8_512_mask((__v8sf)__A, _mm512_undefined_ps(), (__mmask16)-1);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) {
+__funline __m512 _mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M,
+    __m256 __A) {
   return (__m512)__builtin_ia32_broadcastf32x8_512_mask((__v8sf)__A, (__v16sf)__O, __M);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) {
+__funline __m512 _mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) {
   return (__m512)__builtin_ia32_broadcastf32x8_512_mask((__v8sf)__A, (__v16sf)_mm512_setzero_ps(), __M);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_broadcast_i32x8(__m256i __A) {
+__funline __m512i _mm512_broadcast_i32x8(__m256i __A) {
   return (__m512i)__builtin_ia32_broadcasti32x8_512_mask((__v8si)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) {
+__funline __m512i _mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M,
+    __m256i __A) {
   return (__m512i)__builtin_ia32_broadcasti32x8_512_mask((__v8si)__A, (__v16si)__O, __M);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) {
+__funline __m512i _mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) {
   return (__m512i)__builtin_ia32_broadcasti32x8_512_mask((__v8si)__A, (__v16si)_mm512_setzero_si512(), __M);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mullo_epi64(__m512i __A, __m512i __B) {
+__funline __m512i _mm512_mullo_epi64(__m512i __A, __m512i __B) {
   return (__m512i)((__v8du)__A * (__v8du)__B);
 }
-extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
-    __artificial__))
-_mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+__funline __m512i _mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A,
+    __m512i __B) {
   return (__m512i)__builtin_ia32_pmullq512_mask((__v8di)__A, (__v8di)__B, (__v8di)__W, (__mmask8)__U);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+__funline __m512i _mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A,
+    __m512i __B) {
   return (__m512i)__builtin_ia32_pmullq512_mask((__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_xor_pd(__m512d __A, __m512d __B) {
+__funline __m512d _mm512_xor_pd(__m512d __A, __m512d __B) {
   return (__m512d)__builtin_ia32_xorpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)-1);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+__funline __m512d _mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A,
+    __m512d __B) {
   return (__m512d)__builtin_ia32_xorpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+__funline __m512d _mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) {
   return (__m512d)__builtin_ia32_xorpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_xor_ps(__m512 __A, __m512 __B) {
+__funline __m512 _mm512_xor_ps(__m512 __A, __m512 __B) {
   return (__m512)__builtin_ia32_xorps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+__funline __m512 _mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A,
+    __m512 __B) {
   return (__m512)__builtin_ia32_xorps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+__funline __m512 _mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) {
   return (__m512)__builtin_ia32_xorps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_or_pd(__m512d __A, __m512d __B) {
+__funline __m512d _mm512_or_pd(__m512d __A, __m512d __B) {
   return (__m512d)__builtin_ia32_orpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)-1);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+__funline __m512d _mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A,
+    __m512d __B) {
   return (__m512d)__builtin_ia32_orpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+__funline __m512d _mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) {
   return (__m512d)__builtin_ia32_orpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_or_ps(__m512 __A, __m512 __B) {
+__funline __m512 _mm512_or_ps(__m512 __A, __m512 __B) {
   return (__m512)__builtin_ia32_orps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+__funline __m512 _mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A,
+    __m512 __B) {
   return (__m512)__builtin_ia32_orps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+__funline __m512 _mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) {
   return (__m512)__builtin_ia32_orps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_and_pd(__m512d __A, __m512d __B) {
+__funline __m512d _mm512_and_pd(__m512d __A, __m512d __B) {
   return (__m512d)__builtin_ia32_andpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)-1);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+__funline __m512d _mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A,
+    __m512d __B) {
   return (__m512d)__builtin_ia32_andpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+__funline __m512d _mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) {
   return (__m512d)__builtin_ia32_andpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_and_ps(__m512 __A, __m512 __B) {
+__funline __m512 _mm512_and_ps(__m512 __A, __m512 __B) {
   return (__m512)__builtin_ia32_andps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+__funline __m512 _mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A,
+    __m512 __B) {
   return
(__m512)__builtin_ia32_andps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_andps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_andnot_pd(__m512d __A, __m512d __B) { +__funline __m512d _mm512_andnot_pd(__m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_andnpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)-1); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B) { return (__m512d)__builtin_ia32_andnpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_andnpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_andnot_ps(__m512 __A, __m512 __B) { +__funline __m512 _mm512_andnot_ps(__m512 __A, __m512 __B) { return (__m512)__builtin_ia32_andnps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B) { return (__m512)__builtin_ia32_andnps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_andnps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_movepi32_mask(__m512i __A) { +__funline __mmask16 _mm512_movepi32_mask(__m512i __A) { return (__mmask16)__builtin_ia32_cvtd2mask512((__v16si)__A); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_movepi64_mask(__m512i __A) { +__funline __mmask8 _mm512_movepi64_mask(__m512i __A) { return (__mmask8)__builtin_ia32_cvtq2mask512((__v8di)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_movm_epi32(__mmask16 __A) { +__funline __m512i _mm512_movm_epi32(__mmask16 __A) { return (__m512i)__builtin_ia32_cvtmask2d512(__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_movm_epi64(__mmask8 __A) { +__funline __m512i _mm512_movm_epi64(__mmask8 __A) { return (__m512i)__builtin_ia32_cvtmask2q512(__A); } 
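/*
 * A minimal scalar sketch of the merge vs. zero masking convention used by
 * the _mm512_mask_* / _mm512_maskz_* forms above, shown for the 8-lane
 * double xor case. Assumption: model_mask_xor_pd is a made-up helper name
 * for illustration only; it is not part of this patch or of any header.
 */
#include <stdint.h>
#include <string.h>

static void model_mask_xor_pd(double dst[8], const double w[8],
                              uint8_t mask, const double a[8],
                              const double b[8]) {
  for (int i = 0; i < 8; ++i) {
    if ((mask >> i) & 1) {
      uint64_t x, y;                  /* xor the raw bit patterns      */
      memcpy(&x, &a[i], sizeof(x));
      memcpy(&y, &b[i], sizeof(y));
      x ^= y;
      memcpy(&dst[i], &x, sizeof(x)); /* selected lane gets a[i]^b[i]  */
    } else {
      dst[i] = w[i];                  /* mask form: lane kept from __W */
      /* the maskz form writes 0.0 to unselected lanes instead         */
    }
  }
}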
-extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvttpd_epi64(__m512d __A) { +__funline __m512i _mm512_cvttpd_epi64(__m512d __A) { return (__m512i)__builtin_ia32_cvttpd2qq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvttpd_epi64(__m512i __W, __mmask8 __U, __m512d __A) { +__funline __m512i _mm512_mask_cvttpd_epi64(__m512i __W, __mmask8 __U, + __m512d __A) { return (__m512i)__builtin_ia32_cvttpd2qq512_mask( (__v8df)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvttpd_epi64(__mmask8 __U, __m512d __A) { +__funline __m512i _mm512_maskz_cvttpd_epi64(__mmask8 __U, __m512d __A) { return (__m512i)__builtin_ia32_cvttpd2qq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvttpd_epu64(__m512d __A) { +__funline __m512i _mm512_cvttpd_epu64(__m512d __A) { return (__m512i)__builtin_ia32_cvttpd2uqq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvttpd_epu64(__m512i __W, __mmask8 __U, __m512d __A) { +__funline __m512i _mm512_mask_cvttpd_epu64(__m512i __W, __mmask8 __U, + __m512d __A) { return (__m512i)__builtin_ia32_cvttpd2uqq512_mask( (__v8df)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvttpd_epu64(__mmask8 __U, __m512d __A) { +__funline __m512i _mm512_maskz_cvttpd_epu64(__mmask8 __U, __m512d __A) { return (__m512i)__builtin_ia32_cvttpd2uqq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvttps_epi64(__m256 __A) { +__funline __m512i _mm512_cvttps_epi64(__m256 __A) { return (__m512i)__builtin_ia32_cvttps2qq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvttps_epi64(__m512i __W, __mmask8 __U, __m256 __A) { +__funline __m512i _mm512_mask_cvttps_epi64(__m512i __W, __mmask8 __U, + __m256 __A) { return (__m512i)__builtin_ia32_cvttps2qq512_mask( (__v8sf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvttps_epi64(__mmask8 __U, __m256 __A) { +__funline __m512i _mm512_maskz_cvttps_epi64(__mmask8 __U, __m256 __A) { return (__m512i)__builtin_ia32_cvttps2qq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvttps_epu64(__m256 __A) { +__funline __m512i _mm512_cvttps_epu64(__m256 __A) { return (__m512i)__builtin_ia32_cvttps2uqq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) - _mm512_mask_cvttps_epu64(__m512i __W, __mmask8 __U, __m256 __A) { +__funline __m512i _mm512_mask_cvttps_epu64(__m512i __W, __mmask8 __U, + __m256 __A) { return (__m512i)__builtin_ia32_cvttps2uqq512_mask( (__v8sf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvttps_epu64(__mmask8 __U, __m256 __A) { +__funline __m512i _mm512_maskz_cvttps_epu64(__mmask8 __U, __m256 __A) { return (__m512i)__builtin_ia32_cvttps2uqq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtpd_epi64(__m512d __A) { +__funline __m512i _mm512_cvtpd_epi64(__m512d __A) { return (__m512i)__builtin_ia32_cvtpd2qq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtpd_epi64(__m512i __W, __mmask8 __U, __m512d __A) { +__funline __m512i _mm512_mask_cvtpd_epi64(__m512i __W, __mmask8 __U, + __m512d __A) { return (__m512i)__builtin_ia32_cvtpd2qq512_mask( (__v8df)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtpd_epi64(__mmask8 __U, __m512d __A) { +__funline __m512i _mm512_maskz_cvtpd_epi64(__mmask8 __U, __m512d __A) { return (__m512i)__builtin_ia32_cvtpd2qq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtpd_epu64(__m512d __A) { +__funline __m512i _mm512_cvtpd_epu64(__m512d __A) { return (__m512i)__builtin_ia32_cvtpd2uqq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtpd_epu64(__m512i __W, __mmask8 __U, __m512d __A) { +__funline __m512i _mm512_mask_cvtpd_epu64(__m512i __W, __mmask8 __U, + __m512d __A) { return (__m512i)__builtin_ia32_cvtpd2uqq512_mask( (__v8df)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtpd_epu64(__mmask8 __U, __m512d __A) { +__funline __m512i _mm512_maskz_cvtpd_epu64(__mmask8 __U, __m512d __A) { return (__m512i)__builtin_ia32_cvtpd2uqq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtps_epi64(__m256 __A) { +__funline __m512i _mm512_cvtps_epi64(__m256 __A) { return (__m512i)__builtin_ia32_cvtps2qq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtps_epi64(__m512i __W, __mmask8 __U, __m256 __A) { +__funline __m512i _mm512_mask_cvtps_epi64(__m512i __W, __mmask8 __U, __m256 __A) { return (__m512i)__builtin_ia32_cvtps2qq512_mask( (__v8sf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm512_maskz_cvtps_epi64(__mmask8 __U, __m256 __A) { +__funline __m512i _mm512_maskz_cvtps_epi64(__mmask8 __U, __m256 __A) { return (__m512i)__builtin_ia32_cvtps2qq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtps_epu64(__m256 __A) { +__funline __m512i _mm512_cvtps_epu64(__m256 __A) { return (__m512i)__builtin_ia32_cvtps2uqq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtps_epu64(__m512i __W, __mmask8 __U, __m256 __A) { +__funline __m512i _mm512_mask_cvtps_epu64(__m512i __W, __mmask8 __U, __m256 __A) { return (__m512i)__builtin_ia32_cvtps2uqq512_mask( (__v8sf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtps_epu64(__mmask8 __U, __m256 __A) { +__funline __m512i _mm512_maskz_cvtps_epu64(__mmask8 __U, __m256 __A) { return (__m512i)__builtin_ia32_cvtps2uqq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepi64_ps(__m512i __A) { +__funline __m256 _mm512_cvtepi64_ps(__m512i __A) { return (__m256)__builtin_ia32_cvtqq2ps512_mask( (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi64_ps(__m256 __W, __mmask8 __U, __m512i __A) { +__funline __m256 _mm512_mask_cvtepi64_ps(__m256 __W, __mmask8 __U, __m512i __A) { return (__m256)__builtin_ia32_cvtqq2ps512_mask( (__v8di)__A, (__v8sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepi64_ps(__mmask8 __U, __m512i __A) { +__funline __m256 _mm512_maskz_cvtepi64_ps(__mmask8 __U, __m512i __A) { return (__m256)__builtin_ia32_cvtqq2ps512_mask( (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepu64_ps(__m512i __A) { +__funline __m256 _mm512_cvtepu64_ps(__m512i __A) { return (__m256)__builtin_ia32_cvtuqq2ps512_mask( (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepu64_ps(__m256 __W, __mmask8 __U, __m512i __A) { +__funline __m256 _mm512_mask_cvtepu64_ps(__m256 __W, __mmask8 __U, __m512i __A) { return (__m256)__builtin_ia32_cvtuqq2ps512_mask( (__v8di)__A, (__v8sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepu64_ps(__mmask8 __U, __m512i __A) { +__funline __m256 _mm512_maskz_cvtepu64_ps(__mmask8 __U, __m512i __A) { return (__m256)__builtin_ia32_cvtuqq2ps512_mask( (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepi64_pd(__m512i __A) { +__funline __m512d _mm512_cvtepi64_pd(__m512i __A) { 
return (__m512d)__builtin_ia32_cvtqq2pd512_mask( (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi64_pd(__m512d __W, __mmask8 __U, __m512i __A) { +__funline __m512d _mm512_mask_cvtepi64_pd(__m512d __W, __mmask8 __U, + __m512i __A) { return (__m512d)__builtin_ia32_cvtqq2pd512_mask( (__v8di)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepi64_pd(__mmask8 __U, __m512i __A) { +__funline __m512d _mm512_maskz_cvtepi64_pd(__mmask8 __U, __m512i __A) { return (__m512d)__builtin_ia32_cvtqq2pd512_mask( (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepu64_pd(__m512i __A) { +__funline __m512d _mm512_cvtepu64_pd(__m512i __A) { return (__m512d)__builtin_ia32_cvtuqq2pd512_mask( (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepu64_pd(__m512d __W, __mmask8 __U, __m512i __A) { +__funline __m512d _mm512_mask_cvtepu64_pd(__m512d __W, __mmask8 __U, + __m512i __A) { return (__m512d)__builtin_ia32_cvtuqq2pd512_mask( (__v8di)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepu64_pd(__mmask8 __U, __m512i __A) { +__funline __m512d _mm512_maskz_cvtepu64_pd(__mmask8 __U, __m512i __A) { return (__m512d)__builtin_ia32_cvtuqq2pd512_mask( (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } #ifdef __OPTIMIZE__ -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kshiftli_mask8(__mmask8 __A, unsigned int __B) { +__funline __mmask8 _kshiftli_mask8(__mmask8 __A, unsigned int __B) { return (__mmask8)__builtin_ia32_kshiftliqi((__mmask8)__A, (__mmask8)__B); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kshiftri_mask8(__mmask8 __A, unsigned int __B) { +__funline __mmask8 _kshiftri_mask8(__mmask8 __A, unsigned int __B) { return (__mmask8)__builtin_ia32_kshiftriqi((__mmask8)__A, (__mmask8)__B); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_range_pd(__m512d __A, __m512d __B, int __C) { +__funline __m512d _mm512_range_pd(__m512d __A, __m512d __B, int __C) { return (__m512d)__builtin_ia32_rangepd512_mask( (__v8df)__A, (__v8df)__B, __C, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_range_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B, - int __C) { +__funline __m512d _mm512_mask_range_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, int __C) { return (__m512d)__builtin_ia32_rangepd512_mask((__v8df)__A, (__v8df)__B, __C, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_range_pd(__mmask8 __U, __m512d __A, __m512d __B, int __C) { +__funline __m512d _mm512_maskz_range_pd(__mmask8 __U, __m512d __A, __m512d 
__B, + int __C) { return (__m512d)__builtin_ia32_rangepd512_mask( (__v8df)__A, (__v8df)__B, __C, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_range_ps(__m512 __A, __m512 __B, int __C) { +__funline __m512 _mm512_range_ps(__m512 __A, __m512 __B, int __C) { return (__m512)__builtin_ia32_rangeps512_mask( (__v16sf)__A, (__v16sf)__B, __C, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_range_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, - int __C) { +__funline __m512 _mm512_mask_range_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, int __C) { return (__m512)__builtin_ia32_rangeps512_mask((__v16sf)__A, (__v16sf)__B, __C, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_range_ps(__mmask16 __U, __m512 __A, __m512 __B, int __C) { +__funline __m512 _mm512_maskz_range_ps(__mmask16 __U, __m512 __A, __m512 __B, + int __C) { return (__m512)__builtin_ia32_rangeps512_mask( (__v16sf)__A, (__v16sf)__B, __C, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_reduce_sd(__m128d __A, __m128d __B, int __C) { +__funline __m128d _mm_reduce_sd(__m128d __A, __m128d __B, int __C) { return (__m128d)__builtin_ia32_reducesd_mask( (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)-1); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_reduce_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, - int __C) { +__funline __m128d _mm_mask_reduce_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __C) { return (__m128d)__builtin_ia32_reducesd_mask((__v2df)__A, (__v2df)__B, __C, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_reduce_sd(__mmask8 __U, __m128d __A, __m128d __B, int __C) { +__funline __m128d _mm_maskz_reduce_sd(__mmask8 __U, __m128d __A, __m128d __B, + int __C) { return (__m128d)__builtin_ia32_reducesd_mask( (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_reduce_ss(__m128 __A, __m128 __B, int __C) { +__funline __m128 _mm_reduce_ss(__m128 __A, __m128 __B, int __C) { return (__m128)__builtin_ia32_reducess_mask( (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_reduce_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, int __C) { +__funline __m128 _mm_mask_reduce_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __C) { return (__m128)__builtin_ia32_reducess_mask((__v4sf)__A, (__v4sf)__B, __C, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_reduce_ss(__mmask8 __U, __m128 __A, __m128 __B, int __C) { +__funline __m128 _mm_maskz_reduce_ss(__mmask8 __U, __m128 __A, __m128 __B, + int __C) { return (__m128)__builtin_ia32_reducess_mask( (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), 
(__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_range_sd(__m128d __A, __m128d __B, int __C) { +__funline __m128d _mm_range_sd(__m128d __A, __m128d __B, int __C) { return (__m128d)__builtin_ia32_rangesd128_mask_round( (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_range_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, - int __C) { +__funline __m128d _mm_mask_range_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __C) { return (__m128d)__builtin_ia32_rangesd128_mask_round( (__v2df)__A, (__v2df)__B, __C, (__v2df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_range_sd(__mmask8 __U, __m128d __A, __m128d __B, int __C) { +__funline __m128d _mm_maskz_range_sd(__mmask8 __U, __m128d __A, __m128d __B, + int __C) { return (__m128d)__builtin_ia32_rangesd128_mask_round( (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_range_ss(__m128 __A, __m128 __B, int __C) { +__funline __m128 _mm_range_ss(__m128 __A, __m128 __B, int __C) { return (__m128)__builtin_ia32_rangess128_mask_round( (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_range_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, int __C) { +__funline __m128 _mm_mask_range_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __C) { return (__m128)__builtin_ia32_rangess128_mask_round( (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_range_ss(__mmask8 __U, __m128 __A, __m128 __B, int __C) { +__funline __m128 _mm_maskz_range_ss(__mmask8 __U, __m128 __A, __m128 __B, + int __C) { return (__m128)__builtin_ia32_rangess128_mask_round( (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_range_round_sd(__m128d __A, __m128d __B, int __C, const int __R) { +__funline __m128d _mm_range_round_sd(__m128d __A, __m128d __B, int __C, + const int __R) { return (__m128d)__builtin_ia32_rangesd128_mask_round( (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)-1, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_range_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, - int __C, const int __R) { +__funline __m128d _mm_mask_range_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __C, const int __R) { return (__m128d)__builtin_ia32_rangesd128_mask_round( (__v2df)__A, (__v2df)__B, __C, (__v2df)__W, (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_range_round_sd(__mmask8 __U, __m128d __A, __m128d __B, int __C, - const int __R) { +__funline __m128d _mm_maskz_range_round_sd(__mmask8 __U, __m128d __A, __m128d __B, + int __C, const int __R) { return 
(__m128d)__builtin_ia32_rangesd128_mask_round( (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_range_round_ss(__m128 __A, __m128 __B, int __C, const int __R) { +__funline __m128 _mm_range_round_ss(__m128 __A, __m128 __B, int __C, + const int __R) { return (__m128)__builtin_ia32_rangess128_mask_round( (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)-1, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_range_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - int __C, const int __R) { +__funline __m128 _mm_mask_range_round_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __C, const int __R) { return (__m128)__builtin_ia32_rangess128_mask_round( (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)__W, (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_range_round_ss(__mmask8 __U, __m128 __A, __m128 __B, int __C, - const int __R) { +__funline __m128 _mm_maskz_range_round_ss(__mmask8 __U, __m128 __A, __m128 __B, + int __C, const int __R) { return (__m128)__builtin_ia32_rangess128_mask_round( (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fpclass_ss_mask(__m128 __A, const int __imm) { +__funline __mmask8 _mm_fpclass_ss_mask(__m128 __A, const int __imm) { return (__mmask8)__builtin_ia32_fpclassss((__v4sf)__A, __imm); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fpclass_sd_mask(__m128d __A, const int __imm) { +__funline __mmask8 _mm_fpclass_sd_mask(__m128d __A, const int __imm) { return (__mmask8)__builtin_ia32_fpclasssd((__v2df)__A, __imm); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtt_roundpd_epi64(__m512d __A, const int __R) { +__funline __m512i _mm512_cvtt_roundpd_epi64(__m512d __A, const int __R) { return (__m512i)__builtin_ia32_cvttpd2qq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtt_roundpd_epi64(__m512i __W, __mmask8 __U, __m512d __A, - const int __R) { +__funline __m512i _mm512_mask_cvtt_roundpd_epi64(__m512i __W, __mmask8 __U, + __m512d __A, const int __R) { return (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)__A, (__v8di)__W, (__mmask8)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtt_roundpd_epi64(__mmask8 __U, __m512d __A, const int __R) { +__funline __m512i _mm512_maskz_cvtt_roundpd_epi64(__mmask8 __U, __m512d __A, + const int __R) { return (__m512i)__builtin_ia32_cvttpd2qq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtt_roundpd_epu64(__m512d __A, const int __R) { +__funline __m512i _mm512_cvtt_roundpd_epu64(__m512d __A, const int __R) { return (__m512i)__builtin_ia32_cvttpd2uqq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtt_roundpd_epu64(__m512i 
__W, __mmask8 __U, __m512d __A, - const int __R) { +__funline __m512i _mm512_mask_cvtt_roundpd_epu64(__m512i __W, __mmask8 __U, + __m512d __A, const int __R) { return (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)__A, (__v8di)__W, (__mmask8)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtt_roundpd_epu64(__mmask8 __U, __m512d __A, const int __R) { +__funline __m512i _mm512_maskz_cvtt_roundpd_epu64(__mmask8 __U, __m512d __A, + const int __R) { return (__m512i)__builtin_ia32_cvttpd2uqq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtt_roundps_epi64(__m256 __A, const int __R) { +__funline __m512i _mm512_cvtt_roundps_epi64(__m256 __A, const int __R) { return (__m512i)__builtin_ia32_cvttps2qq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtt_roundps_epi64(__m512i __W, __mmask8 __U, __m256 __A, - const int __R) { +__funline __m512i _mm512_mask_cvtt_roundps_epi64(__m512i __W, __mmask8 __U, + __m256 __A, const int __R) { return (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)__A, (__v8di)__W, (__mmask8)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtt_roundps_epi64(__mmask8 __U, __m256 __A, const int __R) { +__funline __m512i _mm512_maskz_cvtt_roundps_epi64(__mmask8 __U, __m256 __A, + const int __R) { return (__m512i)__builtin_ia32_cvttps2qq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtt_roundps_epu64(__m256 __A, const int __R) { +__funline __m512i _mm512_cvtt_roundps_epu64(__m256 __A, const int __R) { return (__m512i)__builtin_ia32_cvttps2uqq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtt_roundps_epu64(__m512i __W, __mmask8 __U, __m256 __A, - const int __R) { +__funline __m512i _mm512_mask_cvtt_roundps_epu64(__m512i __W, __mmask8 __U, + __m256 __A, const int __R) { return (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)__A, (__v8di)__W, (__mmask8)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtt_roundps_epu64(__mmask8 __U, __m256 __A, const int __R) { +__funline __m512i _mm512_maskz_cvtt_roundps_epu64(__mmask8 __U, __m256 __A, + const int __R) { return (__m512i)__builtin_ia32_cvttps2uqq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundpd_epi64(__m512d __A, const int __R) { +__funline __m512i _mm512_cvt_roundpd_epi64(__m512d __A, const int __R) { return (__m512i)__builtin_ia32_cvtpd2qq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundpd_epi64(__m512i __W, __mmask8 __U, __m512d __A, - const int __R) { +__funline __m512i _mm512_mask_cvt_roundpd_epi64(__m512i __W, __mmask8 __U, + __m512d __A, const int __R) { return 
(__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)__A, (__v8di)__W, (__mmask8)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundpd_epi64(__mmask8 __U, __m512d __A, const int __R) { +__funline __m512i _mm512_maskz_cvt_roundpd_epi64(__mmask8 __U, __m512d __A, + const int __R) { return (__m512i)__builtin_ia32_cvtpd2qq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundpd_epu64(__m512d __A, const int __R) { +__funline __m512i _mm512_cvt_roundpd_epu64(__m512d __A, const int __R) { return (__m512i)__builtin_ia32_cvtpd2uqq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundpd_epu64(__m512i __W, __mmask8 __U, __m512d __A, - const int __R) { +__funline __m512i _mm512_mask_cvt_roundpd_epu64(__m512i __W, __mmask8 __U, + __m512d __A, const int __R) { return (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)__A, (__v8di)__W, (__mmask8)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundpd_epu64(__mmask8 __U, __m512d __A, const int __R) { +__funline __m512i _mm512_maskz_cvt_roundpd_epu64(__mmask8 __U, __m512d __A, + const int __R) { return (__m512i)__builtin_ia32_cvtpd2uqq512_mask( (__v8df)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundps_epi64(__m256 __A, const int __R) { +__funline __m512i _mm512_cvt_roundps_epi64(__m256 __A, const int __R) { return (__m512i)__builtin_ia32_cvtps2qq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundps_epi64(__m512i __W, __mmask8 __U, __m256 __A, - const int __R) { +__funline __m512i _mm512_mask_cvt_roundps_epi64(__m512i __W, __mmask8 __U, + __m256 __A, const int __R) { return (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)__A, (__v8di)__W, (__mmask8)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundps_epi64(__mmask8 __U, __m256 __A, const int __R) { +__funline __m512i _mm512_maskz_cvt_roundps_epi64(__mmask8 __U, __m256 __A, + const int __R) { return (__m512i)__builtin_ia32_cvtps2qq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundps_epu64(__m256 __A, const int __R) { +__funline __m512i _mm512_cvt_roundps_epu64(__m256 __A, const int __R) { return (__m512i)__builtin_ia32_cvtps2uqq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundps_epu64(__m512i __W, __mmask8 __U, __m256 __A, - const int __R) { +__funline __m512i _mm512_mask_cvt_roundps_epu64(__m512i __W, __mmask8 __U, + __m256 __A, const int __R) { return (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)__A, (__v8di)__W, (__mmask8)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm512_maskz_cvt_roundps_epu64(__mmask8 __U, __m256 __A, const int __R) { +__funline __m512i _mm512_maskz_cvt_roundps_epu64(__mmask8 __U, __m256 __A, + const int __R) { return (__m512i)__builtin_ia32_cvtps2uqq512_mask( (__v8sf)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U, __R); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundepi64_ps(__m512i __A, const int __R) { +__funline __m256 _mm512_cvt_roundepi64_ps(__m512i __A, const int __R) { return (__m256)__builtin_ia32_cvtqq2ps512_mask( (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, __R); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundepi64_ps(__m256 __W, __mmask8 __U, __m512i __A, - const int __R) { +__funline __m256 _mm512_mask_cvt_roundepi64_ps(__m256 __W, __mmask8 __U, + __m512i __A, const int __R) { return (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)__A, (__v8sf)__W, (__mmask8)__U, __R); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundepi64_ps(__mmask8 __U, __m512i __A, const int __R) { +__funline __m256 _mm512_maskz_cvt_roundepi64_ps(__mmask8 __U, __m512i __A, + const int __R) { return (__m256)__builtin_ia32_cvtqq2ps512_mask( (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, __R); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundepu64_ps(__m512i __A, const int __R) { +__funline __m256 _mm512_cvt_roundepu64_ps(__m512i __A, const int __R) { return (__m256)__builtin_ia32_cvtuqq2ps512_mask( (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, __R); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundepu64_ps(__m256 __W, __mmask8 __U, __m512i __A, - const int __R) { +__funline __m256 _mm512_mask_cvt_roundepu64_ps(__m256 __W, __mmask8 __U, + __m512i __A, const int __R) { return (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)__A, (__v8sf)__W, (__mmask8)__U, __R); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundepu64_ps(__mmask8 __U, __m512i __A, const int __R) { +__funline __m256 _mm512_maskz_cvt_roundepu64_ps(__mmask8 __U, __m512i __A, + const int __R) { return (__m256)__builtin_ia32_cvtuqq2ps512_mask( (__v8di)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundepi64_pd(__m512i __A, const int __R) { +__funline __m512d _mm512_cvt_roundepi64_pd(__m512i __A, const int __R) { return (__m512d)__builtin_ia32_cvtqq2pd512_mask( (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundepi64_pd(__m512d __W, __mmask8 __U, __m512i __A, - const int __R) { +__funline __m512d _mm512_mask_cvt_roundepi64_pd(__m512d __W, __mmask8 __U, + __m512i __A, const int __R) { return (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)__A, (__v8df)__W, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundepi64_pd(__mmask8 __U, __m512i __A, const int __R) { +__funline __m512d _mm512_maskz_cvt_roundepi64_pd(__mmask8 __U, __m512i __A, + const int __R) { return (__m512d)__builtin_ia32_cvtqq2pd512_mask( (__v8di)__A, 
(__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundepu64_pd(__m512i __A, const int __R) { +__funline __m512d _mm512_cvt_roundepu64_pd(__m512i __A, const int __R) { return (__m512d)__builtin_ia32_cvtuqq2pd512_mask( (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundepu64_pd(__m512d __W, __mmask8 __U, __m512i __A, - const int __R) { +__funline __m512d _mm512_mask_cvt_roundepu64_pd(__m512d __W, __mmask8 __U, + __m512i __A, const int __R) { return (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)__A, (__v8df)__W, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundepu64_pd(__mmask8 __U, __m512i __A, const int __R) { +__funline __m512d _mm512_maskz_cvt_roundepu64_pd(__mmask8 __U, __m512i __A, + const int __R) { return (__m512d)__builtin_ia32_cvtuqq2pd512_mask( (__v8di)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_pd(__m512d __A, int __B) { +__funline __m512d _mm512_reduce_pd(__m512d __A, int __B) { return (__m512d)__builtin_ia32_reducepd512_mask( (__v8df)__A, __B, (__v8df)_mm512_setzero_pd(), (__mmask8)-1); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_pd(__m512d __W, __mmask8 __U, __m512d __A, int __B) { +__funline __m512d _mm512_mask_reduce_pd(__m512d __W, __mmask8 __U, __m512d __A, + int __B) { return (__m512d)__builtin_ia32_reducepd512_mask((__v8df)__A, __B, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_reduce_pd(__mmask8 __U, __m512d __A, int __B) { +__funline __m512d _mm512_maskz_reduce_pd(__mmask8 __U, __m512d __A, int __B) { return (__m512d)__builtin_ia32_reducepd512_mask( (__v8df)__A, __B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_ps(__m512 __A, int __B) { +__funline __m512 _mm512_reduce_ps(__m512 __A, int __B) { return (__m512)__builtin_ia32_reduceps512_mask( (__v16sf)__A, __B, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_ps(__m512 __W, __mmask16 __U, __m512 __A, int __B) { +__funline __m512 _mm512_mask_reduce_ps(__m512 __W, __mmask16 __U, __m512 __A, + int __B) { return (__m512)__builtin_ia32_reduceps512_mask((__v16sf)__A, __B, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_reduce_ps(__mmask16 __U, __m512 __A, int __B) { +__funline __m512 _mm512_maskz_reduce_ps(__mmask16 __U, __m512 __A, int __B) { return (__m512)__builtin_ia32_reduceps512_mask( (__v16sf)__A, __B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_extractf32x8_ps(__m512 __A, const int __imm) { +__funline __m256 _mm512_extractf32x8_ps(__m512 __A, const int __imm) { return (__m256)__builtin_ia32_extractf32x8_mask( (__v16sf)__A, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); } -extern 
__inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A, - const int __imm) { +__funline __m256 _mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A, + const int __imm) { return (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)__A, __imm, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A, const int __imm) { +__funline __m256 _mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A, + const int __imm) { return (__m256)__builtin_ia32_extractf32x8_mask( (__v16sf)__A, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_extractf64x2_pd(__m512d __A, const int __imm) { +__funline __m128d _mm512_extractf64x2_pd(__m512d __A, const int __imm) { return (__m128d)__builtin_ia32_extractf64x2_512_mask( (__v8df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)-1); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A, - const int __imm) { +__funline __m128d _mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, + __m512d __A, const int __imm) { return (__m128d)__builtin_ia32_extractf64x2_512_mask( (__v8df)__A, __imm, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A, const int __imm) { +__funline __m128d _mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A, + const int __imm) { return (__m128d)__builtin_ia32_extractf64x2_512_mask( (__v8df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_extracti32x8_epi32(__m512i __A, const int __imm) { +__funline __m256i _mm512_extracti32x8_epi32(__m512i __A, const int __imm) { return (__m256i)__builtin_ia32_extracti32x8_mask( (__v16si)__A, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A, - const int __imm) { +__funline __m256i _mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, + __m512i __A, const int __imm) { return (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)__A, __imm, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A, const int __imm) { +__funline __m256i _mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A, + const int __imm) { return (__m256i)__builtin_ia32_extracti32x8_mask( (__v16si)__A, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_extracti64x2_epi64(__m512i __A, const int __imm) { +__funline __m128i _mm512_extracti64x2_epi64(__m512i __A, const int __imm) { return (__m128i)__builtin_ia32_extracti64x2_512_mask( (__v8di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A, - const 
int __imm) { +__funline __m128i _mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, + __m512i __A, const int __imm) { return (__m128i)__builtin_ia32_extracti64x2_512_mask( (__v8di)__A, __imm, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A, const int __imm) { +__funline __m128i _mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A, + const int __imm) { return (__m128i)__builtin_ia32_extracti64x2_512_mask( (__v8di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_range_round_pd(__m512d __A, __m512d __B, int __C, const int __R) { +__funline __m512d _mm512_range_round_pd(__m512d __A, __m512d __B, int __C, + const int __R) { return (__m512d)__builtin_ia32_rangepd512_mask((__v8df)__A, (__v8df)__B, __C, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_range_round_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B, int __C, const int __R) { +__funline __m512d _mm512_mask_range_round_pd(__m512d __W, __mmask8 __U, + __m512d __A, __m512d __B, int __C, + const int __R) { return (__m512d)__builtin_ia32_rangepd512_mask( (__v8df)__A, (__v8df)__B, __C, (__v8df)__W, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_range_round_pd(__mmask8 __U, __m512d __A, __m512d __B, int __C, - const int __R) { +__funline __m512d _mm512_maskz_range_round_pd(__mmask8 __U, __m512d __A, + __m512d __B, int __C, + const int __R) { return (__m512d)__builtin_ia32_rangepd512_mask((__v8df)__A, (__v8df)__B, __C, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_range_round_ps(__m512 __A, __m512 __B, int __C, const int __R) { +__funline __m512 _mm512_range_round_ps(__m512 __A, __m512 __B, int __C, + const int __R) { return (__m512)__builtin_ia32_rangeps512_mask((__v16sf)__A, (__v16sf)__B, __C, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_range_round_ps(__m512 __W, __mmask16 __U, __m512 __A, - __m512 __B, int __C, const int __R) { +__funline __m512 _mm512_mask_range_round_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, int __C, const int __R) { return (__m512)__builtin_ia32_rangeps512_mask( (__v16sf)__A, (__v16sf)__B, __C, (__v16sf)__W, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_range_round_ps(__mmask16 __U, __m512 __A, __m512 __B, int __C, - const int __R) { +__funline __m512 _mm512_maskz_range_round_ps(__mmask16 __U, __m512 __A, + __m512 __B, int __C, const int __R) { return (__m512)__builtin_ia32_rangeps512_mask((__v16sf)__A, (__v16sf)__B, __C, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_inserti32x8(__m512i __A, __m256i __B, const int __imm) { +__funline __m512i _mm512_inserti32x8(__m512i __A, __m256i __B, const int __imm) { return (__m512i)__builtin_ia32_inserti32x8_mask( (__v16si)__A, (__v8si)__B, __imm, (__v16si)_mm512_setzero_si512(), (__mmask16)-1); } 
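/*
 * A minimal scalar sketch of _mm512_inserti32x8 above: bit 0 of the
 * immediate selects which 256-bit half of __A is replaced by __B
 * (0 = low half, 1 = high half). Assumption: model_inserti32x8 is a
 * made-up helper name for illustration only, not part of this patch.
 */
#include <stdint.h>
#include <string.h>

static void model_inserti32x8(int32_t dst[16], const int32_t a[16],
                              const int32_t b[8], int imm) {
  memcpy(dst, a, 16 * sizeof(int32_t));                /* start from __A  */
  memcpy(dst + (imm & 1) * 8, b, 8 * sizeof(int32_t)); /* overwrite half  */
}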
-extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_inserti32x8(__m512i __W, __mmask16 __U, __m512i __A, - __m256i __B, const int __imm) { +__funline __m512i _mm512_mask_inserti32x8(__m512i __W, __mmask16 __U, __m512i __A, + __m256i __B, const int __imm) { return (__m512i)__builtin_ia32_inserti32x8_mask( (__v16si)__A, (__v8si)__B, __imm, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_inserti32x8(__mmask16 __U, __m512i __A, __m256i __B, - const int __imm) { +__funline __m512i _mm512_maskz_inserti32x8(__mmask16 __U, __m512i __A, + __m256i __B, const int __imm) { return (__m512i)__builtin_ia32_inserti32x8_mask( (__v16si)__A, (__v8si)__B, __imm, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_insertf32x8(__m512 __A, __m256 __B, const int __imm) { +__funline __m512 _mm512_insertf32x8(__m512 __A, __m256 __B, const int __imm) { return (__m512)__builtin_ia32_insertf32x8_mask( (__v16sf)__A, (__v8sf)__B, __imm, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_insertf32x8(__m512 __W, __mmask16 __U, __m512 __A, __m256 __B, - const int __imm) { +__funline __m512 _mm512_mask_insertf32x8(__m512 __W, __mmask16 __U, __m512 __A, + __m256 __B, const int __imm) { return (__m512)__builtin_ia32_insertf32x8_mask( (__v16sf)__A, (__v8sf)__B, __imm, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_insertf32x8(__mmask16 __U, __m512 __A, __m256 __B, - const int __imm) { +__funline __m512 _mm512_maskz_insertf32x8(__mmask16 __U, __m512 __A, __m256 __B, + const int __imm) { return (__m512)__builtin_ia32_insertf32x8_mask( (__v16sf)__A, (__v8sf)__B, __imm, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_inserti64x2(__m512i __A, __m128i __B, const int __imm) { +__funline __m512i _mm512_inserti64x2(__m512i __A, __m128i __B, const int __imm) { return (__m512i)__builtin_ia32_inserti64x2_512_mask( (__v8di)__A, (__v2di)__B, __imm, (__v8di)_mm512_setzero_si512(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_inserti64x2(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B, - const int __imm) { +__funline __m512i _mm512_mask_inserti64x2(__m512i __W, __mmask8 __U, __m512i __A, + __m128i __B, const int __imm) { return (__m512i)__builtin_ia32_inserti64x2_512_mask( (__v8di)__A, (__v2di)__B, __imm, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_inserti64x2(__mmask8 __U, __m512i __A, __m128i __B, - const int __imm) { +__funline __m512i _mm512_maskz_inserti64x2(__mmask8 __U, __m512i __A, __m128i __B, + const int __imm) { return (__m512i)__builtin_ia32_inserti64x2_512_mask( (__v8di)__A, (__v2di)__B, __imm, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_insertf64x2(__m512d __A, __m128d __B, const int __imm) { +__funline __m512d _mm512_insertf64x2(__m512d __A, __m128d __B, const int __imm) { return 
(__m512d)__builtin_ia32_insertf64x2_512_mask( (__v8df)__A, (__v2df)__B, __imm, (__v8df)_mm512_setzero_pd(), (__mmask8)-1); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_insertf64x2(__m512d __W, __mmask8 __U, __m512d __A, __m128d __B, - const int __imm) { +__funline __m512d _mm512_mask_insertf64x2(__m512d __W, __mmask8 __U, __m512d __A, + __m128d __B, const int __imm) { return (__m512d)__builtin_ia32_insertf64x2_512_mask( (__v8df)__A, (__v2df)__B, __imm, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_insertf64x2(__mmask8 __U, __m512d __A, __m128d __B, - const int __imm) { +__funline __m512d _mm512_maskz_insertf64x2(__mmask8 __U, __m512d __A, __m128d __B, + const int __imm) { return (__m512d)__builtin_ia32_insertf64x2_512_mask( (__v8df)__A, (__v2df)__B, __imm, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fpclass_pd_mask(__mmask8 __U, __m512d __A, const int __imm) { +__funline __mmask8 _mm512_mask_fpclass_pd_mask(__mmask8 __U, __m512d __A, + const int __imm) { return (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)__A, __imm, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_fpclass_pd_mask(__m512d __A, const int __imm) { +__funline __mmask8 _mm512_fpclass_pd_mask(__m512d __A, const int __imm) { return (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)__A, __imm, (__mmask8)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fpclass_ps_mask(__mmask16 __U, __m512 __A, const int __imm) { +__funline __mmask16 _mm512_mask_fpclass_ps_mask(__mmask16 __U, __m512 __A, + const int __imm) { return (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)__A, __imm, __U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_fpclass_ps_mask(__m512 __A, const int __imm) { +__funline __mmask16 _mm512_fpclass_ps_mask(__m512 __A, const int __imm) { return (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)__A, __imm, (__mmask16)-1); } diff --git a/third_party/intel/avx512erintrin.internal.h b/third_party/intel/avx512erintrin.internal.h index e50df746c..d59af79bb 100644 --- a/third_party/intel/avx512erintrin.internal.h +++ b/third_party/intel/avx512erintrin.internal.h @@ -21,159 +21,126 @@ typedef unsigned char __mmask8; typedef unsigned short __mmask16; #ifdef __OPTIMIZE__ -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_exp2a23_round_pd(__m512d __A, int __R) { +__funline __m512d _mm512_exp2a23_round_pd(__m512d __A, int __R) { __m512d __W; return (__m512d)__builtin_ia32_exp2pd_mask((__v8df)__A, (__v8df)__W, (__mmask8)-1, __R); } -extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_exp2a23_round_pd(__m512d __W, __mmask8 __U, __m512d __A, int __R) { +__funline __m512d _mm512_mask_exp2a23_round_pd(__m512d __W, __mmask8 __U, + __m512d __A, int __R) { return (__m512d)__builtin_ia32_exp2pd_mask((__v8df)__A, (__v8df)__W, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_exp2a23_round_pd(__mmask8 __U, __m512d __A, int __R) { +__funline __m512d _mm512_maskz_exp2a23_round_pd(__mmask8 __U, 
__m512d __A, + int __R) { return (__m512d)__builtin_ia32_exp2pd_mask( (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_exp2a23_round_ps(__m512 __A, int __R) { +__funline __m512 _mm512_exp2a23_round_ps(__m512 __A, int __R) { __m512 __W; return (__m512)__builtin_ia32_exp2ps_mask((__v16sf)__A, (__v16sf)__W, (__mmask16)-1, __R); } -extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_exp2a23_round_ps(__m512 __W, __mmask16 __U, __m512 __A, int __R) { +__funline __m512 _mm512_mask_exp2a23_round_ps(__m512 __W, __mmask16 __U, + __m512 __A, int __R) { return (__m512)__builtin_ia32_exp2ps_mask((__v16sf)__A, (__v16sf)__W, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_exp2a23_round_ps(__mmask16 __U, __m512 __A, int __R) { +__funline __m512 _mm512_maskz_exp2a23_round_ps(__mmask16 __U, __m512 __A, + int __R) { return (__m512)__builtin_ia32_exp2ps_mask( (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_rcp28_round_pd(__m512d __A, int __R) { +__funline __m512d _mm512_rcp28_round_pd(__m512d __A, int __R) { __m512d __W; return (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)__A, (__v8df)__W, (__mmask8)-1, __R); } -extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_rcp28_round_pd(__m512d __W, __mmask8 __U, __m512d __A, int __R) { +__funline __m512d _mm512_mask_rcp28_round_pd(__m512d __W, __mmask8 __U, + __m512d __A, int __R) { return (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)__A, (__v8df)__W, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_rcp28_round_pd(__mmask8 __U, __m512d __A, int __R) { +__funline __m512d _mm512_maskz_rcp28_round_pd(__mmask8 __U, __m512d __A, + int __R) { return (__m512d)__builtin_ia32_rcp28pd_mask( (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_rcp28_round_ps(__m512 __A, int __R) { +__funline __m512 _mm512_rcp28_round_ps(__m512 __A, int __R) { __m512 __W; return (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)__A, (__v16sf)__W, (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_rcp28_round_ps(__m512 __W, __mmask16 __U, __m512 __A, int __R) { +__funline __m512 _mm512_mask_rcp28_round_ps(__m512 __W, __mmask16 __U, __m512 __A, + int __R) { return (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)__A, (__v16sf)__W, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_rcp28_round_ps(__mmask16 __U, __m512 __A, int __R) { +__funline __m512 _mm512_maskz_rcp28_round_ps(__mmask16 __U, __m512 __A, int __R) { return (__m512)__builtin_ia32_rcp28ps_mask( (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rcp28_round_sd(__m128d __A, __m128d __B, int __R) { +__funline __m128d _mm_rcp28_round_sd(__m128d __A, __m128d __B, int __R) { return (__m128d)__builtin_ia32_rcp28sd_round((__v2df)__B, (__v2df)__A, __R); } -extern 
__inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rcp28_round_ss(__m128 __A, __m128 __B, int __R) { +__funline __m128 _mm_rcp28_round_ss(__m128 __A, __m128 __B, int __R) { return (__m128)__builtin_ia32_rcp28ss_round((__v4sf)__B, (__v4sf)__A, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_rsqrt28_round_pd(__m512d __A, int __R) { +__funline __m512d _mm512_rsqrt28_round_pd(__m512d __A, int __R) { __m512d __W; return (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)__A, (__v8df)__W, (__mmask8)-1, __R); } -extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_rsqrt28_round_pd(__m512d __W, __mmask8 __U, __m512d __A, int __R) { +__funline __m512d _mm512_mask_rsqrt28_round_pd(__m512d __W, __mmask8 __U, + __m512d __A, int __R) { return (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)__A, (__v8df)__W, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_rsqrt28_round_pd(__mmask8 __U, __m512d __A, int __R) { +__funline __m512d _mm512_maskz_rsqrt28_round_pd(__mmask8 __U, __m512d __A, + int __R) { return (__m512d)__builtin_ia32_rsqrt28pd_mask( (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_rsqrt28_round_ps(__m512 __A, int __R) { +__funline __m512 _mm512_rsqrt28_round_ps(__m512 __A, int __R) { __m512 __W; return (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)__A, (__v16sf)__W, (__mmask16)-1, __R); } -extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_rsqrt28_round_ps(__m512 __W, __mmask16 __U, __m512 __A, int __R) { +__funline __m512 _mm512_mask_rsqrt28_round_ps(__m512 __W, __mmask16 __U, + __m512 __A, int __R) { return (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)__A, (__v16sf)__W, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_rsqrt28_round_ps(__mmask16 __U, __m512 __A, int __R) { +__funline __m512 _mm512_maskz_rsqrt28_round_ps(__mmask16 __U, __m512 __A, + int __R) { return (__m512)__builtin_ia32_rsqrt28ps_mask( (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rsqrt28_round_sd(__m128d __A, __m128d __B, int __R) { +__funline __m128d _mm_rsqrt28_round_sd(__m128d __A, __m128d __B, int __R) { return (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)__B, (__v2df)__A, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rsqrt28_round_ss(__m128 __A, __m128 __B, int __R) { +__funline __m128 _mm_rsqrt28_round_ss(__m128 __A, __m128 __B, int __R) { return (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)__B, (__v4sf)__A, __R); } diff --git a/third_party/intel/avx512fintrin.internal.h b/third_party/intel/avx512fintrin.internal.h index f7d7eeeb5..d959242ec 100644 --- a/third_party/intel/avx512fintrin.internal.h +++ b/third_party/intel/avx512fintrin.internal.h @@ -36,46 +36,37 @@ typedef double __m512d_u typedef unsigned char __mmask8; typedef unsigned short __mmask16; -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_int2mask(int __M) { +__funline __mmask16 _mm512_int2mask(int __M) { return (__mmask16)__M; } 
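/*
 * Illustration only, not part of this patch: _mm512_int2mask and
 * _mm512_mask2int, converted in this file, are plain casts between int
 * and __mmask16. A made-up helper (assuming AVX512F and this header in
 * scope) that counts the zero 32-bit lanes of a vector:
 */
int count_zero_lanes(__m512i v) {
  __mmask16 m = _mm512_cmpeq_epi32_mask(v, _mm512_setzero_si512());
  /* widen the mask to int, then population-count the set bits */
  return __builtin_popcount(_mm512_mask2int(m));
}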
-extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask2int(__mmask16 __M) { +__funline int _mm512_mask2int(__mmask16 __M) { return (int)__M; } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_set_epi64(long long __A, long long __B, long long __C, long long __D, - long long __E, long long __F, long long __G, - long long __H) { +__funline __m512i _mm512_set_epi64(long long __A, long long __B, long long __C, + long long __D, long long __E, long long __F, + long long __G, long long __H) { return __extension__(__m512i)(__v8di){__H, __G, __F, __E, __D, __C, __B, __A}; } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_set_epi32(int __A, int __B, int __C, int __D, int __E, int __F, - int __G, int __H, int __I, int __J, int __K, int __L, - int __M, int __N, int __O, int __P) { +__funline __m512i _mm512_set_epi32(int __A, int __B, int __C, int __D, int __E, + int __F, int __G, int __H, int __I, int __J, + int __K, int __L, int __M, int __N, int __O, + int __P) { return __extension__(__m512i)(__v16si){__P, __O, __N, __M, __L, __K, __J, __I, __H, __G, __F, __E, __D, __C, __B, __A}; } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_set_epi16(short __q31, short __q30, short __q29, short __q28, - short __q27, short __q26, short __q25, short __q24, - short __q23, short __q22, short __q21, short __q20, - short __q19, short __q18, short __q17, short __q16, - short __q15, short __q14, short __q13, short __q12, - short __q11, short __q10, short __q09, short __q08, - short __q07, short __q06, short __q05, short __q04, - short __q03, short __q02, short __q01, short __q00) { +__funline __m512i _mm512_set_epi16( + short __q31, short __q30, short __q29, short __q28, short __q27, + short __q26, short __q25, short __q24, short __q23, short __q22, + short __q21, short __q20, short __q19, short __q18, short __q17, + short __q16, short __q15, short __q14, short __q13, short __q12, + short __q11, short __q10, short __q09, short __q08, short __q07, + short __q06, short __q05, short __q04, short __q03, short __q02, + short __q01, short __q00) { return __extension__(__m512i)(__v32hi){ __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, @@ -83,21 +74,18 @@ extern __inline __m512i __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31}; } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_set_epi8(char __q63, char __q62, char __q61, char __q60, char __q59, - char __q58, char __q57, char __q56, char __q55, char __q54, - char __q53, char __q52, char __q51, char __q50, char __q49, - char __q48, char __q47, char __q46, char __q45, char __q44, - char __q43, char __q42, char __q41, char __q40, char __q39, - char __q38, char __q37, char __q36, char __q35, char __q34, - char __q33, char __q32, char __q31, char __q30, char __q29, - char __q28, char __q27, char __q26, char __q25, char __q24, - char __q23, char __q22, char __q21, char __q20, char __q19, - char __q18, char __q17, char __q16, char __q15, char __q14, - char __q13, char __q12, char __q11, char __q10, char __q09, - char __q08, char __q07, char __q06, char __q05, char __q04, - char __q03, char __q02, char __q01, char __q00) { +__funline __m512i _mm512_set_epi8( + char __q63, char __q62, char __q61, char __q60, char __q59, char __q58, + char __q57, char 
__q56, char __q55, char __q54, char __q53, char __q52, + char __q51, char __q50, char __q49, char __q48, char __q47, char __q46, + char __q45, char __q44, char __q43, char __q42, char __q41, char __q40, + char __q39, char __q38, char __q37, char __q36, char __q35, char __q34, + char __q33, char __q32, char __q31, char __q30, char __q29, char __q28, + char __q27, char __q26, char __q25, char __q24, char __q23, char __q22, + char __q21, char __q20, char __q19, char __q18, char __q17, char __q16, + char __q15, char __q14, char __q13, char __q12, char __q11, char __q10, + char __q09, char __q08, char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) { return __extension__(__m512i)(__v64qi){ __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, __q16, __q17, __q18, __q19, @@ -108,19 +96,15 @@ extern __inline __m512i __q60, __q61, __q62, __q63}; } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_set_pd(double __A, double __B, double __C, double __D, double __E, - double __F, double __G, double __H) { +__funline __m512d _mm512_set_pd(double __A, double __B, double __C, double __D, + double __E, double __F, double __G, double __H) { return __extension__(__m512d){__H, __G, __F, __E, __D, __C, __B, __A}; } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_set_ps(float __A, float __B, float __C, float __D, float __E, - float __F, float __G, float __H, float __I, float __J, - float __K, float __L, float __M, float __N, float __O, - float __P) { +__funline __m512 _mm512_set_ps(float __A, float __B, float __C, float __D, + float __E, float __F, float __G, float __H, + float __I, float __J, float __K, float __L, + float __M, float __N, float __O, float __P) { return __extension__(__m512){__P, __O, __N, __M, __L, __K, __J, __I, __H, __G, __F, __E, __D, __C, __B, __A}; } @@ -141,34 +125,26 @@ extern __inline __m512 _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, \ e1, e0) -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_undefined_ps(void) { +__funline __m512 _mm512_undefined_ps(void) { __m512 __Y = __Y; return __Y; } #define _mm512_undefined _mm512_undefined_ps -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_undefined_pd(void) { +__funline __m512d _mm512_undefined_pd(void) { __m512d __Y = __Y; return __Y; } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_undefined_epi32(void) { +__funline __m512i _mm512_undefined_epi32(void) { __m512i __Y = __Y; return __Y; } #define _mm512_undefined_si512 _mm512_undefined_epi32 -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_set1_epi8(char __A) { +__funline __m512i _mm512_set1_epi8(char __A) { return __extension__(__m512i)(__v64qi){ __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, @@ -177,18 +153,14 @@ extern __inline __m512i __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A}; } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_set1_epi16(short __A) { +__funline __m512i _mm512_set1_epi16(short __A) { return __extension__(__m512i)(__v32hi){ __A, __A, __A, __A, __A, __A, __A, __A, __A, 
__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A}; } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_set1_pd(double __A) { +__funline __m512d _mm512_set1_pd(double __A) { return (__m512d)__builtin_ia32_broadcastsd512( __extension__(__v2df){ __A, @@ -196,9 +168,7 @@ extern __inline __m512d (__v8df)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_set1_ps(float __A) { +__funline __m512 _mm512_set1_ps(float __A) { return (__m512)__builtin_ia32_broadcastss512( __extension__(__v4sf){ __A, @@ -206,29 +176,22 @@ extern __inline __m512 (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_set4_epi32(int __A, int __B, int __C, int __D) { +__funline __m512i _mm512_set4_epi32(int __A, int __B, int __C, int __D) { return __extension__(__m512i)(__v16si){__D, __C, __B, __A, __D, __C, __B, __A, __D, __C, __B, __A, __D, __C, __B, __A}; } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_set4_epi64(long long __A, long long __B, long long __C, long long __D) { +__funline __m512i _mm512_set4_epi64(long long __A, long long __B, long long __C, + long long __D) { return __extension__(__m512i)(__v8di){__D, __C, __B, __A, __D, __C, __B, __A}; } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_set4_pd(double __A, double __B, double __C, double __D) { +__funline __m512d _mm512_set4_pd(double __A, double __B, double __C, double __D) { return __extension__(__m512d){__D, __C, __B, __A, __D, __C, __B, __A}; } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_set4_ps(float __A, float __B, float __C, float __D) { +__funline __m512 _mm512_set4_ps(float __A, float __B, float __C, float __D) { return __extension__(__m512){__D, __C, __B, __A, __D, __C, __B, __A, __D, __C, __B, __A, __D, __C, __B, __A}; } @@ -241,544 +204,413 @@ extern __inline __m512 #define _mm512_setr4_ps(e0, e1, e2, e3) _mm512_set4_ps(e3, e2, e1, e0) -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_setzero_ps(void) { +__funline __m512 _mm512_setzero_ps(void) { return __extension__(__m512){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_setzero(void) { +__funline __m512 _mm512_setzero(void) { return _mm512_setzero_ps(); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_setzero_pd(void) { +__funline __m512d _mm512_setzero_pd(void) { return __extension__(__m512d){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_setzero_epi32(void) { +__funline __m512i _mm512_setzero_epi32(void) { return __extension__(__m512i)(__v8di){0, 0, 0, 0, 0, 0, 0, 0}; } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_setzero_si512(void) { +__funline __m512i _mm512_setzero_si512(void) { return __extension__(__m512i)(__v8di){0, 0, 0, 0, 0, 0, 0, 0}; } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm512_mask_mov_pd(__m512d __W, __mmask8 __U, __m512d __A) { +__funline __m512d _mm512_mask_mov_pd(__m512d __W, __mmask8 __U, __m512d __A) { return (__m512d)__builtin_ia32_movapd512_mask((__v8df)__A, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mov_pd(__mmask8 __U, __m512d __A) { +__funline __m512d _mm512_maskz_mov_pd(__mmask8 __U, __m512d __A) { return (__m512d)__builtin_ia32_movapd512_mask( (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_mov_ps(__m512 __W, __mmask16 __U, __m512 __A) { +__funline __m512 _mm512_mask_mov_ps(__m512 __W, __mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_movaps512_mask((__v16sf)__A, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mov_ps(__mmask16 __U, __m512 __A) { +__funline __m512 _mm512_maskz_mov_ps(__mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_movaps512_mask( (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_load_pd(void const *__P) { +__funline __m512d _mm512_load_pd(void const *__P) { return *(__m512d *)__P; } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_load_pd(__m512d __W, __mmask8 __U, void const *__P) { +__funline __m512d _mm512_mask_load_pd(__m512d __W, __mmask8 __U, + void const *__P) { return (__m512d)__builtin_ia32_loadapd512_mask((const __v8df *)__P, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_load_pd(__mmask8 __U, void const *__P) { +__funline __m512d _mm512_maskz_load_pd(__mmask8 __U, void const *__P) { return (__m512d)__builtin_ia32_loadapd512_mask( (const __v8df *)__P, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_store_pd(void *__P, __m512d __A) { +__funline void _mm512_store_pd(void *__P, __m512d __A) { *(__m512d *)__P = __A; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A) { +__funline void _mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A) { __builtin_ia32_storeapd512_mask((__v8df *)__P, (__v8df)__A, (__mmask8)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_load_ps(void const *__P) { +__funline __m512 _mm512_load_ps(void const *__P) { return *(__m512 *)__P; } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_load_ps(__m512 __W, __mmask16 __U, void const *__P) { +__funline __m512 _mm512_mask_load_ps(__m512 __W, __mmask16 __U, void const *__P) { return (__m512)__builtin_ia32_loadaps512_mask((const __v16sf *)__P, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_load_ps(__mmask16 __U, void const *__P) { +__funline __m512 _mm512_maskz_load_ps(__mmask16 __U, void const *__P) { return (__m512)__builtin_ia32_loadaps512_mask( (const __v16sf *)__P, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline void - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_store_ps(void *__P, __m512 __A) { +__funline void _mm512_store_ps(void *__P, __m512 __A) { *(__m512 *)__P = __A; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A) { +__funline void _mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A) { __builtin_ia32_storeaps512_mask((__v16sf *)__P, (__v16sf)__A, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_mov_epi64(__m512i __W, __mmask8 __U, __m512i __A) { +__funline __m512i _mm512_mask_mov_epi64(__m512i __W, __mmask8 __U, __m512i __A) { return (__m512i)__builtin_ia32_movdqa64_512_mask((__v8di)__A, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mov_epi64(__mmask8 __U, __m512i __A) { +__funline __m512i _mm512_maskz_mov_epi64(__mmask8 __U, __m512i __A) { return (__m512i)__builtin_ia32_movdqa64_512_mask( (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_load_epi64(void const *__P) { +__funline __m512i _mm512_load_epi64(void const *__P) { return *(__m512i *)__P; } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_load_epi64(__m512i __W, __mmask8 __U, void const *__P) { +__funline __m512i _mm512_mask_load_epi64(__m512i __W, __mmask8 __U, + void const *__P) { return (__m512i)__builtin_ia32_movdqa64load512_mask( (const __v8di *)__P, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_load_epi64(__mmask8 __U, void const *__P) { +__funline __m512i _mm512_maskz_load_epi64(__mmask8 __U, void const *__P) { return (__m512i)__builtin_ia32_movdqa64load512_mask( (const __v8di *)__P, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_store_epi64(void *__P, __m512i __A) { +__funline void _mm512_store_epi64(void *__P, __m512i __A) { *(__m512i *)__P = __A; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_store_epi64(void *__P, __mmask8 __U, __m512i __A) { +__funline void _mm512_mask_store_epi64(void *__P, __mmask8 __U, __m512i __A) { __builtin_ia32_movdqa64store512_mask((__v8di *)__P, (__v8di)__A, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_mov_epi32(__m512i __W, __mmask16 __U, __m512i __A) { +__funline __m512i _mm512_mask_mov_epi32(__m512i __W, __mmask16 __U, __m512i __A) { return (__m512i)__builtin_ia32_movdqa32_512_mask((__v16si)__A, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mov_epi32(__mmask16 __U, __m512i __A) { +__funline __m512i _mm512_maskz_mov_epi32(__mmask16 __U, __m512i __A) { return (__m512i)__builtin_ia32_movdqa32_512_mask( (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_load_si512(void const *__P) { +__funline __m512i _mm512_load_si512(void const *__P) { return *(__m512i *)__P; } -extern __inline 
__m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_load_epi32(void const *__P) { +__funline __m512i _mm512_load_epi32(void const *__P) { return *(__m512i *)__P; } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_load_epi32(__m512i __W, __mmask16 __U, void const *__P) { +__funline __m512i _mm512_mask_load_epi32(__m512i __W, __mmask16 __U, + void const *__P) { return (__m512i)__builtin_ia32_movdqa32load512_mask( (const __v16si *)__P, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_load_epi32(__mmask16 __U, void const *__P) { +__funline __m512i _mm512_maskz_load_epi32(__mmask16 __U, void const *__P) { return (__m512i)__builtin_ia32_movdqa32load512_mask( (const __v16si *)__P, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_store_si512(void *__P, __m512i __A) { +__funline void _mm512_store_si512(void *__P, __m512i __A) { *(__m512i *)__P = __A; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_store_epi32(void *__P, __m512i __A) { +__funline void _mm512_store_epi32(void *__P, __m512i __A) { *(__m512i *)__P = __A; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_store_epi32(void *__P, __mmask16 __U, __m512i __A) { +__funline void _mm512_mask_store_epi32(void *__P, __mmask16 __U, __m512i __A) { __builtin_ia32_movdqa32store512_mask((__v16si *)__P, (__v16si)__A, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mullo_epi32(__m512i __A, __m512i __B) { +__funline __m512i _mm512_mullo_epi32(__m512i __A, __m512i __B) { return (__m512i)((__v16su)__A * (__v16su)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmulld512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), __M); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmulld512_mask((__v16si)__A, (__v16si)__B, (__v16si)__W, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mullox_epi64(__m512i __A, __m512i __B) { +__funline __m512i _mm512_mullox_epi64(__m512i __A, __m512i __B) { return (__m512i)((__v8du)__A * (__v8du)__B); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_mullox_epi64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_mullox_epi64(__m512i __W, __mmask8 __M, __m512i __A, + __m512i __B) { return _mm512_mask_mov_epi64(__W, __M, _mm512_mullox_epi64(__A, __B)); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sllv_epi32(__m512i __X, __m512i __Y) { +__funline __m512i _mm512_sllv_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_psllv16si_mask( 
(__v16si)__X, (__v16si)__Y, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_psllv16si_mask((__v16si)__X, (__v16si)__Y, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_psllv16si_mask((__v16si)__X, (__v16si)__Y, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_srav_epi32(__m512i __X, __m512i __Y) { +__funline __m512i _mm512_srav_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_psrav16si_mask( (__v16si)__X, (__v16si)__Y, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_psrav16si_mask((__v16si)__X, (__v16si)__Y, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_psrav16si_mask((__v16si)__X, (__v16si)__Y, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_srlv_epi32(__m512i __X, __m512i __Y) { +__funline __m512i _mm512_srlv_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_psrlv16si_mask( (__v16si)__X, (__v16si)__Y, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_psrlv16si_mask((__v16si)__X, (__v16si)__Y, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_psrlv16si_mask((__v16si)__X, (__v16si)__Y, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_add_epi64(__m512i __A, __m512i __B) { +__funline __m512i _mm512_add_epi64(__m512i __A, __m512i __B) { return (__m512i)((__v8du)__A + (__v8du)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B) { return 
(__m512i)__builtin_ia32_paddq512_mask((__v8di)__A, (__v8di)__B, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_paddq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sub_epi64(__m512i __A, __m512i __B) { +__funline __m512i _mm512_sub_epi64(__m512i __A, __m512i __B) { return (__m512i)((__v8du)__A - (__v8du)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psubq512_mask((__v8di)__A, (__v8di)__B, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_psubq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sllv_epi64(__m512i __X, __m512i __Y) { +__funline __m512i _mm512_sllv_epi64(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_psllv8di_mask( (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_psllv8di_mask((__v8di)__X, (__v8di)__Y, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_psllv8di_mask( (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_srav_epi64(__m512i __X, __m512i __Y) { +__funline __m512i _mm512_srav_epi64(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_psrav8di_mask( (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_psrav8di_mask((__v8di)__X, (__v8di)__Y, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_psrav8di_mask( (__v8di)__X, (__v8di)__Y, 
(__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_srlv_epi64(__m512i __X, __m512i __Y) { +__funline __m512i _mm512_srlv_epi64(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_psrlv8di_mask( (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_psrlv8di_mask((__v8di)__X, (__v8di)__Y, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_psrlv8di_mask( (__v8di)__X, (__v8di)__Y, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_add_epi32(__m512i __A, __m512i __B) { +__funline __m512i _mm512_add_epi32(__m512i __A, __m512i __B) { return (__m512i)((__v16su)__A + (__v16su)__B); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_paddd512_mask((__v16si)__A, (__v16si)__B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_add_epi32(__mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_add_epi32(__mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_paddd512_mask((__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mul_epi32(__m512i __X, __m512i __Y) { +__funline __m512i _mm512_mul_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuldq512_mask( (__v16si)__X, (__v16si)__Y, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_pmuldq512_mask((__v16si)__X, (__v16si)__Y, (__v8di)__W, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuldq512_mask( (__v16si)__X, (__v16si)__Y, (__v8di)_mm512_setzero_si512(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sub_epi32(__m512i __A, __m512i __B) { +__funline __m512i _mm512_sub_epi32(__m512i __A, __m512i __B) { return (__m512i)((__v16su)__A - (__v16su)__B); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) 
-_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psubd512_mask((__v16si)__A, (__v16si)__B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_psubd512_mask((__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mul_epu32(__m512i __X, __m512i __Y) { +__funline __m512i _mm512_mul_epu32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuludq512_mask( (__v16si)__X, (__v16si)__Y, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_pmuludq512_mask((__v16si)__X, (__v16si)__Y, (__v8di)__W, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuludq512_mask( (__v16si)__X, (__v16si)__Y, (__v8di)_mm512_setzero_si512(), __M); } #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_slli_epi64(__m512i __A, unsigned int __B) { +__funline __m512i _mm512_slli_epi64(__m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_psllqi512_mask( (__v8di)__A, __B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, - unsigned int __B) { +__funline __m512i _mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) { return (__m512i)__builtin_ia32_psllqi512_mask((__v8di)__A, __B, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B) { +__funline __m512i _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, + unsigned int __B) { return (__m512i)__builtin_ia32_psllqi512_mask( (__v8di)__A, __B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } @@ -798,46 +630,36 @@ extern __inline __m512i (__mmask8)(U))) #endif -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sll_epi64(__m512i __A, __m128i __B) { +__funline __m512i _mm512_sll_epi64(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psllq512_mask( (__v8di)__A, (__v2di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, + __m128i __B) { return (__m512i)__builtin_ia32_psllq512_mask((__v8di)__A, 
(__v2di)__B, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psllq512_mask( (__v8di)__A, (__v2di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_srli_epi64(__m512i __A, unsigned int __B) { +__funline __m512i _mm512_srli_epi64(__m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_psrlqi512_mask( (__v8di)__A, __B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, - unsigned int __B) { +__funline __m512i _mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) { return (__m512i)__builtin_ia32_psrlqi512_mask((__v8di)__A, __B, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, unsigned int __B) { +__funline __m512i _mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, + unsigned int __B) { return (__m512i)__builtin_ia32_psrlqi512_mask( (__v8di)__A, __B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } @@ -857,46 +679,36 @@ extern __inline __m512i (__mmask8)(U))) #endif -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_srl_epi64(__m512i __A, __m128i __B) { +__funline __m512i _mm512_srl_epi64(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrlq512_mask( (__v8di)__A, (__v2di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, + __m128i __B) { return (__m512i)__builtin_ia32_psrlq512_mask((__v8di)__A, (__v2di)__B, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrlq512_mask( (__v8di)__A, (__v2di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_srai_epi64(__m512i __A, unsigned int __B) { +__funline __m512i _mm512_srai_epi64(__m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_psraqi512_mask( (__v8di)__A, __B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, - unsigned int __B) { +__funline __m512i _mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) { return (__m512i)__builtin_ia32_psraqi512_mask((__v8di)__A, __B, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B) { 
+__funline __m512i _mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, + unsigned int __B) { return (__m512i)__builtin_ia32_psraqi512_mask( (__v8di)__A, __B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } @@ -916,46 +728,36 @@ extern __inline __m512i (__mmask8)(U))) #endif -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sra_epi64(__m512i __A, __m128i __B) { +__funline __m512i _mm512_sra_epi64(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psraq512_mask( (__v8di)__A, (__v2di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, + __m128i __B) { return (__m512i)__builtin_ia32_psraq512_mask((__v8di)__A, (__v2di)__B, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psraq512_mask( (__v8di)__A, (__v2di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_slli_epi32(__m512i __A, unsigned int __B) { +__funline __m512i _mm512_slli_epi32(__m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_pslldi512_mask( (__v16si)__A, __B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, - unsigned int __B) { +__funline __m512i _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) { return (__m512i)__builtin_ia32_pslldi512_mask((__v16si)__A, __B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { +__funline __m512i _mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, + unsigned int __B) { return (__m512i)__builtin_ia32_pslldi512_mask( (__v16si)__A, __B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } @@ -975,48 +777,39 @@ extern __inline __m512i (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)(U))) #endif -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sll_epi32(__m512i __A, __m128i __B) { +__funline __m512i _mm512_sll_epi32(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_pslld512_mask( (__v16si)__A, (__v4si)__B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, + __m128i __B) { return (__m512i)__builtin_ia32_pslld512_mask((__v16si)__A, (__v4si)__B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, + __m128i __B) { return 
(__m512i)__builtin_ia32_pslld512_mask((__v16si)__A, (__v4si)__B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_srli_epi32(__m512i __A, unsigned int __B) { +__funline __m512i _mm512_srli_epi32(__m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_psrldi512_mask( (__v16si)__A, __B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, - unsigned int __B) { +__funline __m512i _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) { return (__m512i)__builtin_ia32_psrldi512_mask((__v16si)__A, __B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { +__funline __m512i _mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, + unsigned int __B) { return (__m512i)__builtin_ia32_psrldi512_mask( (__v16si)__A, __B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } @@ -1036,48 +829,39 @@ extern __inline __m512i (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)(U))) #endif -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_srl_epi32(__m512i __A, __m128i __B) { +__funline __m512i _mm512_srl_epi32(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrld512_mask( (__v16si)__A, (__v4si)__B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, + __m128i __B) { return (__m512i)__builtin_ia32_psrld512_mask((__v16si)__A, (__v4si)__B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, + __m128i __B) { return (__m512i)__builtin_ia32_psrld512_mask((__v16si)__A, (__v4si)__B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_srai_epi32(__m512i __A, unsigned int __B) { +__funline __m512i _mm512_srai_epi32(__m512i __A, unsigned int __B) { return (__m512i)__builtin_ia32_psradi512_mask( (__v16si)__A, __B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, - unsigned int __B) { +__funline __m512i _mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) { return (__m512i)__builtin_ia32_psradi512_mask((__v16si)__A, __B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { +__funline __m512i _mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, + unsigned int __B) { return (__m512i)__builtin_ia32_psradi512_mask( (__v16si)__A, __B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } @@ 
-1097,110 +881,86 @@ extern __inline __m512i (__v16si)(__m512i)_mm512_setzero_si512(), (__mmask16)(U))) #endif -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sra_epi32(__m512i __A, __m128i __B) { +__funline __m512i _mm512_sra_epi32(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrad512_mask( (__v16si)__A, (__v4si)__B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, + __m128i __B) { return (__m512i)__builtin_ia32_psrad512_mask((__v16si)__A, (__v4si)__B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) { +__funline __m512i _mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, + __m128i __B) { return (__m512i)__builtin_ia32_psrad512_mask((__v16si)__A, (__v4si)__B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } #ifdef __OPTIMIZE__ -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_add_round_sd(__m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_add_round_sd(__m128d __A, __m128d __B, const int __R) { return (__m128d)__builtin_ia32_addsd_round((__v2df)__A, (__v2df)__B, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_add_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, - const int __R) { +__funline __m128d _mm_mask_add_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { return (__m128d)__builtin_ia32_addsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_add_round_sd(__mmask8 __U, __m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_maskz_add_round_sd(__mmask8 __U, __m128d __A, __m128d __B, + const int __R) { return (__m128d)__builtin_ia32_addsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_add_round_ss(__m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_add_round_ss(__m128 __A, __m128 __B, const int __R) { return (__m128)__builtin_ia32_addss_round((__v4sf)__A, (__v4sf)__B, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_add_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_mask_add_round_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_addss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_add_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_maskz_add_round_ss(__mmask8 __U, __m128 __A, __m128 __B, + const int __R) { return (__m128)__builtin_ia32_addss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
-    _mm_sub_round_sd(__m128d __A, __m128d __B, const int __R) {
+__funline __m128d _mm_sub_round_sd(__m128d __A, __m128d __B, const int __R) {
   return (__m128d)__builtin_ia32_subsd_round((__v2df)__A, (__v2df)__B, __R);
 }
 
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mask_sub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
-                          const int __R) {
+__funline __m128d _mm_mask_sub_round_sd(__m128d __W, __mmask8 __U, __m128d __A,
+                                        __m128d __B, const int __R) {
   return (__m128d)__builtin_ia32_subsd_mask_round(
       (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R);
 }
 
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
-                                       __artificial__))
-_mm_maskz_sub_round_sd(__mmask8 __U, __m128d __A, __m128d __B, const int __R) {
+__funline __m128d _mm_maskz_sub_round_sd(__mmask8 __U, __m128d __A, __m128d __B,
+                                         const int __R) {
   return (__m128d)__builtin_ia32_subsd_mask_round(
      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R);
 }
 
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_sub_round_ss(__m128 __A, __m128 __B, const int __R) {
+__funline __m128 _mm_sub_round_ss(__m128 __A, __m128 __B, const int __R) {
   return (__m128)__builtin_ia32_subss_round((__v4sf)__A, (__v4sf)__B, __R);
 }
 
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mask_sub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
-                          const int __R) {
+__funline __m128 _mm_mask_sub_round_ss(__m128 __W, __mmask8 __U, __m128 __A,
+                                       __m128 __B, const int __R) {
   return (__m128)__builtin_ia32_subss_mask_round(
       (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R);
 }
 
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
-                                      __artificial__))
-_mm_maskz_sub_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) {
+__funline __m128 _mm_maskz_sub_round_ss(__mmask8 __U, __m128 __A, __m128 __B,
+                                        const int __R) {
   return (__m128)__builtin_ia32_subss_mask_round(
       (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R);
 }
@@ -1243,50 +1003,42 @@ _mm_maskz_sub_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) {
 #endif
 
 #ifdef __OPTIMIZE__
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_ternarylogic_epi64(__m512i __A, __m512i __B, __m512i __C,
-                              const int __imm) {
+__funline __m512i _mm512_ternarylogic_epi64(__m512i __A, __m512i __B, __m512i __C,
+                                            const int __imm) {
   return (__m512i)__builtin_ia32_pternlogq512_mask(
       (__v8di)__A, (__v8di)__B, (__v8di)__C, __imm, (__mmask8)-1);
 }
 
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_ternarylogic_epi64(__m512i __A, __mmask8 __U, __m512i __B,
-                                   __m512i __C, const int __imm) {
+__funline __m512i _mm512_mask_ternarylogic_epi64(__m512i __A, __mmask8 __U,
+                                                 __m512i __B, __m512i __C,
+                                                 const int __imm) {
   return (__m512i)__builtin_ia32_pternlogq512_mask(
       (__v8di)__A, (__v8di)__B, (__v8di)__C, __imm, (__mmask8)__U);
 }
 
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_ternarylogic_epi64(__mmask8 __U, __m512i __A, __m512i __B,
-                                    __m512i __C, const int __imm) {
+__funline __m512i _mm512_maskz_ternarylogic_epi64(__mmask8 __U, __m512i __A,
+                                                  __m512i __B, __m512i __C,
+                                                  const int __imm) {
   return (__m512i)__builtin_ia32_pternlogq512_maskz(
       (__v8di)__A, (__v8di)__B, (__v8di)__C, __imm, (__mmask8)__U);
 }
 
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_ternarylogic_epi32(__m512i __A, __m512i __B, __m512i __C,
-                              const int __imm) {
+__funline __m512i _mm512_ternarylogic_epi32(__m512i __A, __m512i __B, __m512i __C,
+                                            const int __imm) {
   return (__m512i)__builtin_ia32_pternlogd512_mask(
       (__v16si)__A, (__v16si)__B, (__v16si)__C, __imm, (__mmask16)-1);
 }
 
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_ternarylogic_epi32(__m512i __A, __mmask16 __U, __m512i __B,
-                                   __m512i __C, const int __imm) {
+__funline __m512i _mm512_mask_ternarylogic_epi32(__m512i __A, __mmask16 __U,
+                                                 __m512i __B, __m512i __C,
+                                                 const int __imm) {
   return (__m512i)__builtin_ia32_pternlogd512_mask(
       (__v16si)__A, (__v16si)__B, (__v16si)__C, __imm, (__mmask16)__U);
 }
 
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_ternarylogic_epi32(__mmask16 __U, __m512i __A, __m512i __B,
-                                    __m512i __C, const int __imm) {
+__funline __m512i _mm512_maskz_ternarylogic_epi32(__mmask16 __U, __m512i __A,
                                                  __m512i __B, __m512i __C,
+                                                  const int __imm) {
   return (__m512i)__builtin_ia32_pternlogd512_maskz(
      (__v16si)__A, (__v16si)__B, (__v16si)__C, __imm, (__mmask16)__U);
 }
@@ -1317,255 +1069,191 @@ extern __inline __m512i
       (int)(I), (__mmask16)(U)))
 #endif
 
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_rcp14_pd(__m512d __A) {
+__funline __m512d _mm512_rcp14_pd(__m512d __A) {
   return (__m512d)__builtin_ia32_rcp14pd512_mask(
       (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1);
 }
 
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_rcp14_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+__funline __m512d _mm512_mask_rcp14_pd(__m512d __W, __mmask8 __U, __m512d __A) {
   return (__m512d)__builtin_ia32_rcp14pd512_mask((__v8df)__A, (__v8df)__W,
                                                  (__mmask8)__U);
 }
 
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_rcp14_pd(__mmask8 __U, __m512d __A) {
+__funline __m512d _mm512_maskz_rcp14_pd(__mmask8 __U, __m512d __A) {
   return (__m512d)__builtin_ia32_rcp14pd512_mask(
       (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
 }
 
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_rcp14_ps(__m512 __A) {
+__funline __m512 _mm512_rcp14_ps(__m512 __A) {
   return (__m512)__builtin_ia32_rcp14ps512_mask(
       (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1);
 }
 
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_rcp14_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+__funline __m512 _mm512_mask_rcp14_ps(__m512 __W, __mmask16 __U, __m512 __A) {
   return (__m512)__builtin_ia32_rcp14ps512_mask((__v16sf)__A, (__v16sf)__W,
                                                 (__mmask16)__U);
 }
 
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_rcp14_ps(__mmask16 __U, __m512 __A) {
+__funline __m512 _mm512_maskz_rcp14_ps(__mmask16 __U, __m512 __A) {
   return (__m512)__builtin_ia32_rcp14ps512_mask(
      (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
 }
 
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_rcp14_sd(__m128d __A, __m128d __B) {
+__funline __m128d _mm_rcp14_sd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_rcp14sd((__v2df)__B, (__v2df)__A);
 }
 
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mask_rcp14_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_mask_rcp14_sd(__m128d __W, __mmask8 __U, __m128d __A,
+                                    __m128d __B) {
   return (__m128d)__builtin_ia32_rcp14sd_mask((__v2df)__B, (__v2df)__A,
                                               (__v2df)__W, (__mmask8)__U);
 }
 
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_maskz_rcp14_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_maskz_rcp14_sd(__mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_rcp14sd_mask(
       (__v2df)__B, (__v2df)__A, (__v2df)_mm_setzero_ps(), (__mmask8)__U);
 }
 
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_rcp14_ss(__m128 __A, __m128 __B) {
+__funline __m128 _mm_rcp14_ss(__m128 __A, __m128 __B) {
   return (__m128)__builtin_ia32_rcp14ss((__v4sf)__B, (__v4sf)__A);
 }
 
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mask_rcp14_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+__funline __m128 _mm_mask_rcp14_ss(__m128 __W, __mmask8 __U, __m128 __A,
+                                   __m128 __B) {
   return (__m128)__builtin_ia32_rcp14ss_mask((__v4sf)__B, (__v4sf)__A,
                                              (__v4sf)__W, (__mmask8)__U);
 }
 
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_maskz_rcp14_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+__funline __m128 _mm_maskz_rcp14_ss(__mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128)__builtin_ia32_rcp14ss_mask(
       (__v4sf)__B, (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
 }
 
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_rsqrt14_pd(__m512d __A) {
+__funline __m512d _mm512_rsqrt14_pd(__m512d __A) {
   return (__m512d)__builtin_ia32_rsqrt14pd512_mask(
      (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1);
 }
 
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_rsqrt14_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+__funline __m512d _mm512_mask_rsqrt14_pd(__m512d __W, __mmask8 __U, __m512d __A) {
   return (__m512d)__builtin_ia32_rsqrt14pd512_mask((__v8df)__A, (__v8df)__W,
                                                    (__mmask8)__U);
 }
 
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_rsqrt14_pd(__mmask8 __U, __m512d __A) {
+__funline __m512d _mm512_maskz_rsqrt14_pd(__mmask8 __U, __m512d __A) {
   return (__m512d)__builtin_ia32_rsqrt14pd512_mask(
      (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U);
 }
 
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_rsqrt14_ps(__m512 __A) {
+__funline __m512 _mm512_rsqrt14_ps(__m512 __A) {
   return (__m512)__builtin_ia32_rsqrt14ps512_mask(
      (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1);
 }
 
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_rsqrt14_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+__funline __m512 _mm512_mask_rsqrt14_ps(__m512 __W, __mmask16 __U, __m512 __A) {
   return (__m512)__builtin_ia32_rsqrt14ps512_mask((__v16sf)__A, (__v16sf)__W,
                                                   (__mmask16)__U);
 }
 
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_rsqrt14_ps(__mmask16 __U, __m512 __A) {
+__funline __m512 _mm512_maskz_rsqrt14_ps(__mmask16 __U, __m512 __A) {
   return (__m512)__builtin_ia32_rsqrt14ps512_mask(
      (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
 }
 
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_rsqrt14_sd(__m128d __A, __m128d __B) {
+__funline __m128d _mm_rsqrt14_sd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_rsqrt14sd((__v2df)__B, (__v2df)__A);
 }
 
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mask_rsqrt14_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_mask_rsqrt14_sd(__m128d __W, __mmask8 __U, __m128d __A,
+                                      __m128d __B) {
   return (__m128d)__builtin_ia32_rsqrt14sd_mask((__v2df)__B, (__v2df)__A,
                                                 (__v2df)__W, (__mmask8)__U);
 }
 
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_maskz_rsqrt14_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_maskz_rsqrt14_sd(__mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_rsqrt14sd_mask(
      (__v2df)__B, (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
 }
 
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_rsqrt14_ss(__m128 __A, __m128 __B) {
+__funline __m128 _mm_rsqrt14_ss(__m128 __A, __m128 __B) {
   return (__m128)__builtin_ia32_rsqrt14ss((__v4sf)__B, (__v4sf)__A);
 }
 
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mask_rsqrt14_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+__funline __m128 _mm_mask_rsqrt14_ss(__m128 __W, __mmask8 __U, __m128 __A,
                                     __m128 __B) {
   return (__m128)__builtin_ia32_rsqrt14ss_mask((__v4sf)__B, (__v4sf)__A,
                                                (__v4sf)__W, (__mmask8)__U);
 }
 
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_maskz_rsqrt14_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+__funline __m128 _mm_maskz_rsqrt14_ss(__mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128)__builtin_ia32_rsqrt14ss_mask(
      (__v4sf)__B, (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
 }
 
 #ifdef __OPTIMIZE__
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_sqrt_round_pd(__m512d __A, const int __R) {
+__funline __m512d _mm512_sqrt_round_pd(__m512d __A, const int __R) {
   return (__m512d)__builtin_ia32_sqrtpd512_mask(
      (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R);
 }
 
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_sqrt_round_pd(__m512d __W, __mmask8 __U, __m512d __A,
-                              const int __R) {
+__funline __m512d _mm512_mask_sqrt_round_pd(__m512d __W, __mmask8 __U,
                                            __m512d __A, const int __R) {
   return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__A, (__v8df)__W,
                                                 (__mmask8)__U, __R);
 }
 
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_sqrt_round_pd(__mmask8 __U, __m512d __A, const int __R) {
+__funline __m512d _mm512_maskz_sqrt_round_pd(__mmask8 __U, __m512d __A,
                                             const int __R) {
   return (__m512d)__builtin_ia32_sqrtpd512_mask(
      (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R);
 }
 
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_sqrt_round_ps(__m512 __A, const int __R) {
+__funline __m512 _mm512_sqrt_round_ps(__m512 __A, const int __R) {
   return (__m512)__builtin_ia32_sqrtps512_mask(
      (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R);
 }
 
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_sqrt_round_ps(__m512
__W, __mmask16 __U, __m512 __A, - const int __R) { +__funline __m512 _mm512_mask_sqrt_round_ps(__m512 __W, __mmask16 __U, __m512 __A, + const int __R) { return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A, (__v16sf)__W, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sqrt_round_ps(__mmask16 __U, __m512 __A, const int __R) { +__funline __m512 _mm512_maskz_sqrt_round_ps(__mmask16 __U, __m512 __A, + const int __R) { return (__m512)__builtin_ia32_sqrtps512_mask( (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sqrt_round_sd(__m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_sqrt_round_sd(__m128d __A, __m128d __B, const int __R) { return (__m128d)__builtin_ia32_sqrtsd_mask_round( (__v2df)__B, (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sqrt_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, - const int __R) { +__funline __m128d _mm_mask_sqrt_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { return (__m128d)__builtin_ia32_sqrtsd_mask_round( (__v2df)__B, (__v2df)__A, (__v2df)__W, (__mmask8)__U, __R); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_sqrt_round_sd(__mmask8 __U, __m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_maskz_sqrt_round_sd(__mmask8 __U, __m128d __A, __m128d __B, + const int __R) { return (__m128d)__builtin_ia32_sqrtsd_mask_round( (__v2df)__B, (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sqrt_round_ss(__m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_sqrt_round_ss(__m128 __A, __m128 __B, const int __R) { return (__m128)__builtin_ia32_sqrtss_mask_round( (__v4sf)__B, (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sqrt_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_mask_sqrt_round_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_sqrtss_mask_round( (__v4sf)__B, (__v4sf)__A, (__v4sf)__W, (__mmask8)__U, __R); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_sqrt_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_maskz_sqrt_round_ss(__mmask8 __U, __m128 __A, __m128 __B, + const int __R) { return (__m128)__builtin_ia32_sqrtss_mask_round( (__v4sf)__B, (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); } @@ -1613,312 +1301,238 @@ _mm_maskz_sqrt_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { __builtin_ia32_sqrtss_mask_round(B, A, (__v4sf)_mm_setzero_ps(), U, C) #endif -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepi8_epi32(__m128i __A) { +__funline __m512i _mm512_cvtepi8_epi32(__m128i __A) { return (__m512i)__builtin_ia32_pmovsxbd512_mask( (__v16qi)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A) { +__funline __m512i _mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, + __m128i __A) { return (__m512i)__builtin_ia32_pmovsxbd512_mask((__v16qi)__A, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) { +__funline __m512i _mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) { return (__m512i)__builtin_ia32_pmovsxbd512_mask( (__v16qi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepi8_epi64(__m128i __A) { +__funline __m512i _mm512_cvtepi8_epi64(__m128i __A) { return (__m512i)__builtin_ia32_pmovsxbq512_mask( (__v16qi)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A) { +__funline __m512i _mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, + __m128i __A) { return (__m512i)__builtin_ia32_pmovsxbq512_mask((__v16qi)__A, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { +__funline __m512i _mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { return (__m512i)__builtin_ia32_pmovsxbq512_mask( (__v16qi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepi16_epi32(__m256i __A) { +__funline __m512i _mm512_cvtepi16_epi32(__m256i __A) { return (__m512i)__builtin_ia32_pmovsxwd512_mask( (__v16hi)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A) { +__funline __m512i _mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, + __m256i __A) { return (__m512i)__builtin_ia32_pmovsxwd512_mask((__v16hi)__A, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) { +__funline __m512i _mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) { return (__m512i)__builtin_ia32_pmovsxwd512_mask( (__v16hi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepi16_epi64(__m128i __A) { +__funline __m512i _mm512_cvtepi16_epi64(__m128i __A) { return (__m512i)__builtin_ia32_pmovsxwq512_mask( (__v8hi)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A) { +__funline __m512i _mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, + __m128i __A) { return (__m512i)__builtin_ia32_pmovsxwq512_mask((__v8hi)__A, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { +__funline __m512i _mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { return (__m512i)__builtin_ia32_pmovsxwq512_mask( (__v8hi)__A, 
(__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepi32_epi64(__m256i __X) { +__funline __m512i _mm512_cvtepi32_epi64(__m256i __X) { return (__m512i)__builtin_ia32_pmovsxdq512_mask( (__v8si)__X, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X) { +__funline __m512i _mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, + __m256i __X) { return (__m512i)__builtin_ia32_pmovsxdq512_mask((__v8si)__X, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) { +__funline __m512i _mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) { return (__m512i)__builtin_ia32_pmovsxdq512_mask( (__v8si)__X, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepu8_epi32(__m128i __A) { +__funline __m512i _mm512_cvtepu8_epi32(__m128i __A) { return (__m512i)__builtin_ia32_pmovzxbd512_mask( (__v16qi)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A) { +__funline __m512i _mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, + __m128i __A) { return (__m512i)__builtin_ia32_pmovzxbd512_mask((__v16qi)__A, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) { +__funline __m512i _mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) { return (__m512i)__builtin_ia32_pmovzxbd512_mask( (__v16qi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepu8_epi64(__m128i __A) { +__funline __m512i _mm512_cvtepu8_epi64(__m128i __A) { return (__m512i)__builtin_ia32_pmovzxbq512_mask( (__v16qi)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A) { +__funline __m512i _mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, + __m128i __A) { return (__m512i)__builtin_ia32_pmovzxbq512_mask((__v16qi)__A, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { +__funline __m512i _mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { return (__m512i)__builtin_ia32_pmovzxbq512_mask( (__v16qi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepu16_epi32(__m256i __A) { +__funline __m512i _mm512_cvtepu16_epi32(__m256i __A) { return (__m512i)__builtin_ia32_pmovzxwd512_mask( (__v16hi)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A) { +__funline __m512i _mm512_mask_cvtepu16_epi32(__m512i __W, 
__mmask16 __U, + __m256i __A) { return (__m512i)__builtin_ia32_pmovzxwd512_mask((__v16hi)__A, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) { +__funline __m512i _mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) { return (__m512i)__builtin_ia32_pmovzxwd512_mask( (__v16hi)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepu16_epi64(__m128i __A) { +__funline __m512i _mm512_cvtepu16_epi64(__m128i __A) { return (__m512i)__builtin_ia32_pmovzxwq512_mask( (__v8hi)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A) { +__funline __m512i _mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, + __m128i __A) { return (__m512i)__builtin_ia32_pmovzxwq512_mask((__v8hi)__A, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { +__funline __m512i _mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { return (__m512i)__builtin_ia32_pmovzxwq512_mask( (__v8hi)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepu32_epi64(__m256i __X) { +__funline __m512i _mm512_cvtepu32_epi64(__m256i __X) { return (__m512i)__builtin_ia32_pmovzxdq512_mask( (__v8si)__X, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X) { +__funline __m512i _mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, + __m256i __X) { return (__m512i)__builtin_ia32_pmovzxdq512_mask((__v8si)__X, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) { +__funline __m512i _mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) { return (__m512i)__builtin_ia32_pmovzxdq512_mask( (__v8si)__X, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } #ifdef __OPTIMIZE__ -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_add_round_pd(__m512d __A, __m512d __B, const int __R) { +__funline __m512d _mm512_add_round_pd(__m512d __A, __m512d __B, const int __R) { return (__m512d)__builtin_ia32_addpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_add_round_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B, const int __R) { +__funline __m512d _mm512_mask_add_round_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { return (__m512d)__builtin_ia32_addpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_add_round_pd(__mmask8 __U, __m512d __A, __m512d __B, - const int __R) { +__funline __m512d _mm512_maskz_add_round_pd(__mmask8 __U, __m512d __A, + __m512d __B, const int __R) { return 
(__m512d)__builtin_ia32_addpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_add_round_ps(__m512 __A, __m512 __B, const int __R) { +__funline __m512 _mm512_add_round_ps(__m512 __A, __m512 __B, const int __R) { return (__m512)__builtin_ia32_addps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_add_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, - const int __R) { +__funline __m512 _mm512_mask_add_round_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) { return (__m512)__builtin_ia32_addps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_add_round_ps(__mmask16 __U, __m512 __A, __m512 __B, - const int __R) { +__funline __m512 _mm512_maskz_add_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + const int __R) { return (__m512)__builtin_ia32_addps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sub_round_pd(__m512d __A, __m512d __B, const int __R) { +__funline __m512d _mm512_sub_round_pd(__m512d __A, __m512d __B, const int __R) { return (__m512d)__builtin_ia32_subpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_sub_round_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B, const int __R) { +__funline __m512d _mm512_mask_sub_round_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { return (__m512d)__builtin_ia32_subpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, - const int __R) { +__funline __m512d _mm512_maskz_sub_round_pd(__mmask8 __U, __m512d __A, + __m512d __B, const int __R) { return (__m512d)__builtin_ia32_subpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sub_round_ps(__m512 __A, __m512 __B, const int __R) { +__funline __m512 _mm512_sub_round_ps(__m512 __A, __m512 __B, const int __R) { return (__m512)__builtin_ia32_subps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_sub_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, - const int __R) { +__funline __m512 _mm512_mask_sub_round_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) { return (__m512)__builtin_ia32_subps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, - const int __R) { +__funline __m512 _mm512_maskz_sub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + const int __R) { return 
(__m512)__builtin_ia32_subps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); @@ -1970,186 +1584,142 @@ extern __inline __m512 #endif #ifdef __OPTIMIZE__ -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mul_round_pd(__m512d __A, __m512d __B, const int __R) { +__funline __m512d _mm512_mul_round_pd(__m512d __A, __m512d __B, const int __R) { return (__m512d)__builtin_ia32_mulpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_mul_round_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B, const int __R) { +__funline __m512d _mm512_mask_mul_round_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { return (__m512d)__builtin_ia32_mulpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mul_round_pd(__mmask8 __U, __m512d __A, __m512d __B, - const int __R) { +__funline __m512d _mm512_maskz_mul_round_pd(__mmask8 __U, __m512d __A, + __m512d __B, const int __R) { return (__m512d)__builtin_ia32_mulpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mul_round_ps(__m512 __A, __m512 __B, const int __R) { +__funline __m512 _mm512_mul_round_ps(__m512 __A, __m512 __B, const int __R) { return (__m512)__builtin_ia32_mulps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_mul_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, - const int __R) { +__funline __m512 _mm512_mask_mul_round_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) { return (__m512)__builtin_ia32_mulps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mul_round_ps(__mmask16 __U, __m512 __A, __m512 __B, - const int __R) { +__funline __m512 _mm512_maskz_mul_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + const int __R) { return (__m512)__builtin_ia32_mulps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_div_round_pd(__m512d __M, __m512d __V, const int __R) { +__funline __m512d _mm512_div_round_pd(__m512d __M, __m512d __V, const int __R) { return (__m512d)__builtin_ia32_divpd512_mask((__v8df)__M, (__v8df)__V, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_div_round_pd(__m512d __W, __mmask8 __U, __m512d __M, - __m512d __V, const int __R) { +__funline __m512d _mm512_mask_div_round_pd(__m512d __W, __mmask8 __U, __m512d __M, + __m512d __V, const int __R) { return (__m512d)__builtin_ia32_divpd512_mask((__v8df)__M, (__v8df)__V, (__v8df)__W, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_div_round_pd(__mmask8 __U, __m512d __M, __m512d __V, - const int __R) { +__funline __m512d 
_mm512_maskz_div_round_pd(__mmask8 __U, __m512d __M, + __m512d __V, const int __R) { return (__m512d)__builtin_ia32_divpd512_mask((__v8df)__M, (__v8df)__V, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_div_round_ps(__m512 __A, __m512 __B, const int __R) { +__funline __m512 _mm512_div_round_ps(__m512 __A, __m512 __B, const int __R) { return (__m512)__builtin_ia32_divps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_div_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, - const int __R) { +__funline __m512 _mm512_mask_div_round_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) { return (__m512)__builtin_ia32_divps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_div_round_ps(__mmask16 __U, __m512 __A, __m512 __B, - const int __R) { +__funline __m512 _mm512_maskz_div_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + const int __R) { return (__m512)__builtin_ia32_divps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mul_round_sd(__m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_mul_round_sd(__m128d __A, __m128d __B, const int __R) { return (__m128d)__builtin_ia32_mulsd_round((__v2df)__A, (__v2df)__B, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mul_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, - const int __R) { +__funline __m128d _mm_mask_mul_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { return (__m128d)__builtin_ia32_mulsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_mul_round_sd(__mmask8 __U, __m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_maskz_mul_round_sd(__mmask8 __U, __m128d __A, __m128d __B, + const int __R) { return (__m128d)__builtin_ia32_mulsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mul_round_ss(__m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_mul_round_ss(__m128 __A, __m128 __B, const int __R) { return (__m128)__builtin_ia32_mulss_round((__v4sf)__A, (__v4sf)__B, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mul_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_mask_mul_round_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_mulss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_mul_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_maskz_mul_round_ss(__mmask8 __U, __m128 __A, __m128 __B, + const int __R) { return (__m128)__builtin_ia32_mulss_mask_round( 
(__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_div_round_sd(__m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_div_round_sd(__m128d __A, __m128d __B, const int __R) { return (__m128d)__builtin_ia32_divsd_round((__v2df)__A, (__v2df)__B, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_div_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, - const int __R) { +__funline __m128d _mm_mask_div_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { return (__m128d)__builtin_ia32_divsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_div_round_sd(__mmask8 __U, __m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_maskz_div_round_sd(__mmask8 __U, __m128d __A, __m128d __B, + const int __R) { return (__m128d)__builtin_ia32_divsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_div_round_ss(__m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_div_round_ss(__m128 __A, __m128 __B, const int __R) { return (__m128)__builtin_ia32_divss_round((__v4sf)__A, (__v4sf)__B, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_div_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_mask_div_round_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_divss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_div_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_maskz_div_round_ss(__mmask8 __U, __m128 __A, __m128 __B, + const int __R) { return (__m128)__builtin_ia32_divss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); } @@ -2236,101 +1806,77 @@ _mm_maskz_div_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { #endif #ifdef __OPTIMIZE__ -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_max_round_pd(__m512d __A, __m512d __B, const int __R) { +__funline __m512d _mm512_max_round_pd(__m512d __A, __m512d __B, const int __R) { return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_max_round_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B, const int __R) { +__funline __m512d _mm512_mask_max_round_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_max_round_pd(__mmask8 __U, __m512d __A, __m512d __B, - const int __R) { +__funline __m512d _mm512_maskz_max_round_pd(__mmask8 __U, __m512d __A, + __m512d __B, const int __R) { return 
(__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_max_round_ps(__m512 __A, __m512 __B, const int __R) { +__funline __m512 _mm512_max_round_ps(__m512 __A, __m512 __B, const int __R) { return (__m512)__builtin_ia32_maxps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_max_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, - const int __R) { +__funline __m512 _mm512_mask_max_round_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) { return (__m512)__builtin_ia32_maxps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_max_round_ps(__mmask16 __U, __m512 __A, __m512 __B, - const int __R) { +__funline __m512 _mm512_maskz_max_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + const int __R) { return (__m512)__builtin_ia32_maxps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_min_round_pd(__m512d __A, __m512d __B, const int __R) { +__funline __m512d _mm512_min_round_pd(__m512d __A, __m512d __B, const int __R) { return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_min_round_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B, const int __R) { +__funline __m512d _mm512_mask_min_round_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) { return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_min_round_pd(__mmask8 __U, __m512d __A, __m512d __B, - const int __R) { +__funline __m512d _mm512_maskz_min_round_pd(__mmask8 __U, __m512d __A, + __m512d __B, const int __R) { return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_min_round_ps(__m512 __A, __m512 __B, const int __R) { +__funline __m512 _mm512_min_round_ps(__m512 __A, __m512 __B, const int __R) { return (__m512)__builtin_ia32_minps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_min_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, - const int __R) { +__funline __m512 _mm512_mask_min_round_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) { return (__m512)__builtin_ia32_minps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_min_round_ps(__mmask16 __U, __m512 __A, __m512 __B, - const int __R) { +__funline __m512 _mm512_maskz_min_round_ps(__mmask16 __U, __m512 __A, __m512 __B, + const int __R) { return 
(__m512)__builtin_ia32_minps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); @@ -2382,97 +1928,77 @@ extern __inline __m512 #endif #ifdef __OPTIMIZE__ -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_scalef_round_pd(__m512d __A, __m512d __B, const int __R) { +__funline __m512d _mm512_scalef_round_pd(__m512d __A, __m512d __B, + const int __R) { return (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_scalef_round_pd(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B, const int __R) { +__funline __m512d _mm512_mask_scalef_round_pd(__m512d __W, __mmask8 __U, + __m512d __A, __m512d __B, + const int __R) { return (__m512d)__builtin_ia32_scalefpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_scalef_round_pd(__mmask8 __U, __m512d __A, __m512d __B, - const int __R) { +__funline __m512d _mm512_maskz_scalef_round_pd(__mmask8 __U, __m512d __A, + __m512d __B, const int __R) { return (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_scalef_round_ps(__m512 __A, __m512 __B, const int __R) { +__funline __m512 _mm512_scalef_round_ps(__m512 __A, __m512 __B, const int __R) { return (__m512)__builtin_ia32_scalefps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_scalef_round_ps(__m512 __W, __mmask16 __U, __m512 __A, - __m512 __B, const int __R) { +__funline __m512 _mm512_mask_scalef_round_ps(__m512 __W, __mmask16 __U, + __m512 __A, __m512 __B, + const int __R) { return (__m512)__builtin_ia32_scalefps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_scalef_round_ps(__mmask16 __U, __m512 __A, __m512 __B, - const int __R) { +__funline __m512 _mm512_maskz_scalef_round_ps(__mmask16 __U, __m512 __A, + __m512 __B, const int __R) { return (__m512)__builtin_ia32_scalefps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_scalef_round_sd(__m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_scalef_round_sd(__m128d __A, __m128d __B, const int __R) { return (__m128d)__builtin_ia32_scalefsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)-1, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_scalef_round_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, const int __R) { +__funline __m128d _mm_mask_scalef_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { return (__m128d)__builtin_ia32_scalefsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_scalef_round_sd(__mmask8 __U, 
__m128d __A, __m128d __B, - const int __R) { +__funline __m128d _mm_maskz_scalef_round_sd(__mmask8 __U, __m128d __A, + __m128d __B, const int __R) { return (__m128d)__builtin_ia32_scalefsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_scalef_round_ss(__m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_scalef_round_ss(__m128 __A, __m128 __B, const int __R) { return (__m128)__builtin_ia32_scalefss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_scalef_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_mask_scalef_round_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_scalefss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_scalef_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_maskz_scalef_round_ss(__mmask8 __U, __m128 __A, __m128 __B, + const int __R) { return (__m128)__builtin_ia32_scalefss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); } @@ -2509,374 +2035,320 @@ _mm_maskz_scalef_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { #endif #ifdef __OPTIMIZE__ -extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, const int __R) { +__funline __m512d _mm512_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfmaddpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fmadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, - __m512d __C, const int __R) { +__funline __m512d _mm512_mask_fmadd_round_pd(__m512d __A, __mmask8 __U, + __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfmaddpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask3_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, - __mmask8 __U, const int __R) { +__funline __m512d _mm512_mask3_fmadd_round_pd(__m512d __A, __m512d __B, + __m512d __C, __mmask8 __U, + const int __R) { return (__m512d)__builtin_ia32_vfmaddpd512_mask3( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fmadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, - __m512d __C, const int __R) { +__funline __m512d _mm512_maskz_fmadd_round_pd(__mmask8 __U, __m512d __A, + __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfmaddpd512_maskz( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, const int __R) { +__funline __m512 _mm512_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, + const int __R) { return 
(__m512)__builtin_ia32_vfmaddps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fmadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512 __C, const int __R) { +__funline __m512 _mm512_mask_fmadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) { return (__m512)__builtin_ia32_vfmaddps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask3_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U, const int __R) { +__funline __m512 _mm512_mask3_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { return (__m512)__builtin_ia32_vfmaddps512_mask3( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fmadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, - __m512 __C, const int __R) { +__funline __m512 _mm512_maskz_fmadd_round_ps(__mmask16 __U, __m512 __A, + __m512 __B, __m512 __C, + const int __R) { return (__m512)__builtin_ia32_vfmaddps512_maskz( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, const int __R) { +__funline __m512d _mm512_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfmsubpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fmsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, - __m512d __C, const int __R) { +__funline __m512d _mm512_mask_fmsub_round_pd(__m512d __A, __mmask8 __U, + __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfmsubpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask3_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, - __mmask8 __U, const int __R) { +__funline __m512d _mm512_mask3_fmsub_round_pd(__m512d __A, __m512d __B, + __m512d __C, __mmask8 __U, + const int __R) { return (__m512d)__builtin_ia32_vfmsubpd512_mask3( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fmsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, - __m512d __C, const int __R) { +__funline __m512d _mm512_maskz_fmsub_round_pd(__mmask8 __U, __m512d __A, + __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfmsubpd512_maskz( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, const int __R) { +__funline __m512 _mm512_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, + const int __R) { return (__m512)__builtin_ia32_vfmsubps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fmsub_round_ps(__m512 __A, 
__mmask16 __U, __m512 __B, - __m512 __C, const int __R) { +__funline __m512 _mm512_mask_fmsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) { return (__m512)__builtin_ia32_vfmsubps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask3_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U, const int __R) { +__funline __m512 _mm512_mask3_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { return (__m512)__builtin_ia32_vfmsubps512_mask3( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fmsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, - __m512 __C, const int __R) { +__funline __m512 _mm512_maskz_fmsub_round_ps(__mmask16 __U, __m512 __A, + __m512 __B, __m512 __C, + const int __R) { return (__m512)__builtin_ia32_vfmsubps512_maskz( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C, const int __R) { +__funline __m512d _mm512_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfmaddsubpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fmaddsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, - __m512d __C, const int __R) { +__funline __m512d _mm512_mask_fmaddsub_round_pd(__m512d __A, __mmask8 __U, + __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfmaddsubpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask3_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C, - __mmask8 __U, const int __R) { +__funline __m512d _mm512_mask3_fmaddsub_round_pd(__m512d __A, __m512d __B, + __m512d __C, __mmask8 __U, + const int __R) { return (__m512d)__builtin_ia32_vfmaddsubpd512_mask3( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fmaddsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, - __m512d __C, const int __R) { +__funline __m512d _mm512_maskz_fmaddsub_round_pd(__mmask8 __U, __m512d __A, + __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfmaddsubpd512_maskz( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C, const int __R) { +__funline __m512 _mm512_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C, + const int __R) { return (__m512)__builtin_ia32_vfmaddsubps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fmaddsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512 __C, const int __R) { +__funline __m512 _mm512_mask_fmaddsub_round_ps(__m512 __A, __mmask16 __U, + __m512 __B, __m512 __C, + const int __R) { return 
(__m512)__builtin_ia32_vfmaddsubps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask3_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U, const int __R) { +__funline __m512 _mm512_mask3_fmaddsub_round_ps(__m512 __A, __m512 __B, + __m512 __C, __mmask16 __U, + const int __R) { return (__m512)__builtin_ia32_vfmaddsubps512_mask3( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fmaddsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, - __m512 __C, const int __R) { +__funline __m512 _mm512_maskz_fmaddsub_round_ps(__mmask16 __U, __m512 __A, + __m512 __B, __m512 __C, + const int __R) { return (__m512)__builtin_ia32_vfmaddsubps512_maskz( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C, const int __R) { +__funline __m512d _mm512_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfmaddsubpd512_mask( (__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fmsubadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, - __m512d __C, const int __R) { +__funline __m512d _mm512_mask_fmsubadd_round_pd(__m512d __A, __mmask8 __U, + __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfmaddsubpd512_mask( (__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask3_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C, - __mmask8 __U, const int __R) { +__funline __m512d _mm512_mask3_fmsubadd_round_pd(__m512d __A, __m512d __B, + __m512d __C, __mmask8 __U, + const int __R) { return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fmsubadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, - __m512d __C, const int __R) { +__funline __m512d _mm512_maskz_fmsubadd_round_pd(__mmask8 __U, __m512d __A, + __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfmaddsubpd512_maskz( (__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C, const int __R) { +__funline __m512 _mm512_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C, + const int __R) { return (__m512)__builtin_ia32_vfmaddsubps512_mask( (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fmsubadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512 __C, const int __R) { +__funline __m512 _mm512_mask_fmsubadd_round_ps(__m512 __A, __mmask16 __U, + __m512 __B, __m512 __C, + const int __R) { return (__m512)__builtin_ia32_vfmaddsubps512_mask( (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512 - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask3_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U, const int __R) { +__funline __m512 _mm512_mask3_fmsubadd_round_ps(__m512 __A, __m512 __B, + __m512 __C, __mmask16 __U, + const int __R) { return (__m512)__builtin_ia32_vfmsubaddps512_mask3( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fmsubadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, - __m512 __C, const int __R) { +__funline __m512 _mm512_maskz_fmsubadd_round_ps(__mmask16 __U, __m512 __A, + __m512 __B, __m512 __C, + const int __R) { return (__m512)__builtin_ia32_vfmaddsubps512_maskz( (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, const int __R) { +__funline __m512d _mm512_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfnmaddpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fnmadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, - __m512d __C, const int __R) { +__funline __m512d _mm512_mask_fnmadd_round_pd(__m512d __A, __mmask8 __U, + __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfnmaddpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask3_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, - __mmask8 __U, const int __R) { +__funline __m512d _mm512_mask3_fnmadd_round_pd(__m512d __A, __m512d __B, + __m512d __C, __mmask8 __U, + const int __R) { return (__m512d)__builtin_ia32_vfnmaddpd512_mask3( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fnmadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, - __m512d __C, const int __R) { +__funline __m512d _mm512_maskz_fnmadd_round_pd(__mmask8 __U, __m512d __A, + __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfnmaddpd512_maskz( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, const int __R) { +__funline __m512 _mm512_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, + const int __R) { return (__m512)__builtin_ia32_vfnmaddps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fnmadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512 __C, const int __R) { +__funline __m512 _mm512_mask_fnmadd_round_ps(__m512 __A, __mmask16 __U, + __m512 __B, __m512 __C, + const int __R) { return (__m512)__builtin_ia32_vfnmaddps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask3_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U, const int __R) { +__funline __m512 
_mm512_mask3_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { return (__m512)__builtin_ia32_vfnmaddps512_mask3( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fnmadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, - __m512 __C, const int __R) { +__funline __m512 _mm512_maskz_fnmadd_round_ps(__mmask16 __U, __m512 __A, + __m512 __B, __m512 __C, + const int __R) { return (__m512)__builtin_ia32_vfnmaddps512_maskz( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, const int __R) { +__funline __m512d _mm512_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfnmsubpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fnmsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, - __m512d __C, const int __R) { +__funline __m512d _mm512_mask_fnmsub_round_pd(__m512d __A, __mmask8 __U, + __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfnmsubpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask3_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, - __mmask8 __U, const int __R) { +__funline __m512d _mm512_mask3_fnmsub_round_pd(__m512d __A, __m512d __B, + __m512d __C, __mmask8 __U, + const int __R) { return (__m512d)__builtin_ia32_vfnmsubpd512_mask3( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fnmsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, - __m512d __C, const int __R) { +__funline __m512d _mm512_maskz_fnmsub_round_pd(__mmask8 __U, __m512d __A, + __m512d __B, __m512d __C, + const int __R) { return (__m512d)__builtin_ia32_vfnmsubpd512_maskz( (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, const int __R) { +__funline __m512 _mm512_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, + const int __R) { return (__m512)__builtin_ia32_vfnmsubps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fnmsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512 __C, const int __R) { +__funline __m512 _mm512_mask_fnmsub_round_ps(__m512 __A, __mmask16 __U, + __m512 __B, __m512 __C, + const int __R) { return (__m512)__builtin_ia32_vfnmsubps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask3_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, - __mmask16 __U, const int __R) { +__funline __m512 _mm512_mask3_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) { return (__m512)__builtin_ia32_vfnmsubps512_mask3( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); 
} -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fnmsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, - __m512 __C, const int __R) { +__funline __m512 _mm512_maskz_fnmsub_round_ps(__mmask16 __U, __m512 __A, + __m512 __B, __m512 __C, + const int __R) { return (__m512)__builtin_ia32_vfnmsubps512_maskz( (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, __R); } @@ -3026,249 +2498,186 @@ extern __inline __m512 (__m512) __builtin_ia32_vfnmsubps512_maskz(A, B, C, U, R) #endif -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_abs_epi64(__m512i __A) { +__funline __m512i _mm512_abs_epi64(__m512i __A) { return (__m512i)__builtin_ia32_pabsq512_mask( (__v8di)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_abs_epi64(__m512i __W, __mmask8 __U, __m512i __A) { +__funline __m512i _mm512_mask_abs_epi64(__m512i __W, __mmask8 __U, __m512i __A) { return (__m512i)__builtin_ia32_pabsq512_mask((__v8di)__A, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_abs_epi64(__mmask8 __U, __m512i __A) { +__funline __m512i _mm512_maskz_abs_epi64(__mmask8 __U, __m512i __A) { return (__m512i)__builtin_ia32_pabsq512_mask( (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_abs_epi32(__m512i __A) { +__funline __m512i _mm512_abs_epi32(__m512i __A) { return (__m512i)__builtin_ia32_pabsd512_mask( (__v16si)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_abs_epi32(__m512i __W, __mmask16 __U, __m512i __A) { +__funline __m512i _mm512_mask_abs_epi32(__m512i __W, __mmask16 __U, __m512i __A) { return (__m512i)__builtin_ia32_pabsd512_mask((__v16si)__A, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_abs_epi32(__mmask16 __U, __m512i __A) { +__funline __m512i _mm512_maskz_abs_epi32(__mmask16 __U, __m512i __A) { return (__m512i)__builtin_ia32_pabsd512_mask( (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_broadcastss_ps(__m128 __A) { +__funline __m512 _mm512_broadcastss_ps(__m128 __A) { return (__m512)__builtin_ia32_broadcastss512( (__v4sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, __m128 __A) { +__funline __m512 _mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, + __m128 __A) { return (__m512)__builtin_ia32_broadcastss512((__v4sf)__A, (__v16sf)__O, __M); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_broadcastss_ps(__mmask16 __M, __m128 __A) { +__funline __m512 _mm512_maskz_broadcastss_ps(__mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_broadcastss512( (__v4sf)__A, (__v16sf)_mm512_setzero_ps(), __M); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_broadcastsd_pd(__m128d __A) { +__funline 
__m512d _mm512_broadcastsd_pd(__m128d __A) { return (__m512d)__builtin_ia32_broadcastsd512( (__v2df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, __m128d __A) { +__funline __m512d _mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, + __m128d __A) { return (__m512d)__builtin_ia32_broadcastsd512((__v2df)__A, (__v8df)__O, __M); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) { +__funline __m512d _mm512_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_broadcastsd512( (__v2df)__A, (__v8df)_mm512_setzero_pd(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_broadcastd_epi32(__m128i __A) { +__funline __m512i _mm512_broadcastd_epi32(__m128i __A) { return (__m512i)__builtin_ia32_pbroadcastd512( (__v4si)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_broadcastd_epi32(__m512i __O, __mmask16 __M, __m128i __A) { +__funline __m512i _mm512_mask_broadcastd_epi32(__m512i __O, __mmask16 __M, + __m128i __A) { return (__m512i)__builtin_ia32_pbroadcastd512((__v4si)__A, (__v16si)__O, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_broadcastd_epi32(__mmask16 __M, __m128i __A) { +__funline __m512i _mm512_maskz_broadcastd_epi32(__mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_pbroadcastd512( (__v4si)__A, (__v16si)_mm512_setzero_si512(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_set1_epi32(int __A) { +__funline __m512i _mm512_set1_epi32(int __A) { return (__m512i)__builtin_ia32_pbroadcastd512_gpr_mask( __A, (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1)); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_set1_epi32(__m512i __O, __mmask16 __M, int __A) { +__funline __m512i _mm512_mask_set1_epi32(__m512i __O, __mmask16 __M, int __A) { return (__m512i)__builtin_ia32_pbroadcastd512_gpr_mask(__A, (__v16si)__O, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_set1_epi32(__mmask16 __M, int __A) { +__funline __m512i _mm512_maskz_set1_epi32(__mmask16 __M, int __A) { return (__m512i)__builtin_ia32_pbroadcastd512_gpr_mask( __A, (__v16si)_mm512_setzero_si512(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_broadcastq_epi64(__m128i __A) { +__funline __m512i _mm512_broadcastq_epi64(__m128i __A) { return (__m512i)__builtin_ia32_pbroadcastq512( (__v2di)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_broadcastq_epi64(__m512i __O, __mmask8 __M, __m128i __A) { +__funline __m512i _mm512_mask_broadcastq_epi64(__m512i __O, __mmask8 __M, + __m128i __A) { return (__m512i)__builtin_ia32_pbroadcastq512((__v2di)__A, (__v8di)__O, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { +__funline __m512i 
_mm512_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_pbroadcastq512( (__v2di)__A, (__v8di)_mm512_setzero_si512(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_set1_epi64(long long __A) { +__funline __m512i _mm512_set1_epi64(long long __A) { return (__m512i)__builtin_ia32_pbroadcastq512_gpr_mask( __A, (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1)); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_set1_epi64(__m512i __O, __mmask8 __M, long long __A) { +__funline __m512i _mm512_mask_set1_epi64(__m512i __O, __mmask8 __M, + long long __A) { return (__m512i)__builtin_ia32_pbroadcastq512_gpr_mask(__A, (__v8di)__O, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_set1_epi64(__mmask8 __M, long long __A) { +__funline __m512i _mm512_maskz_set1_epi64(__mmask8 __M, long long __A) { return (__m512i)__builtin_ia32_pbroadcastq512_gpr_mask( __A, (__v8di)_mm512_setzero_si512(), __M); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_broadcast_f32x4(__m128 __A) { +__funline __m512 _mm512_broadcast_f32x4(__m128 __A) { return (__m512)__builtin_ia32_broadcastf32x4_512( (__v4sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) { +__funline __m512 _mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, + __m128 __A) { return (__m512)__builtin_ia32_broadcastf32x4_512((__v4sf)__A, (__v16sf)__O, __M); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) { +__funline __m512 _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_broadcastf32x4_512( (__v4sf)__A, (__v16sf)_mm512_setzero_ps(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_broadcast_i32x4(__m128i __A) { +__funline __m512i _mm512_broadcast_i32x4(__m128i __A) { return (__m512i)__builtin_ia32_broadcasti32x4_512( (__v4si)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) { +__funline __m512i _mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, + __m128i __A) { return (__m512i)__builtin_ia32_broadcasti32x4_512((__v4si)__A, (__v16si)__O, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) { +__funline __m512i _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_broadcasti32x4_512( (__v4si)__A, (__v16si)_mm512_setzero_si512(), __M); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_broadcast_f64x4(__m256d __A) { +__funline __m512d _mm512_broadcast_f64x4(__m256d __A) { return (__m512d)__builtin_ia32_broadcastf64x4_512( (__v4df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A) { +__funline 
__m512d _mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, + __m256d __A) { return (__m512d)__builtin_ia32_broadcastf64x4_512((__v4df)__A, (__v8df)__O, __M); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) { +__funline __m512d _mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) { return (__m512d)__builtin_ia32_broadcastf64x4_512( (__v4df)__A, (__v8df)_mm512_setzero_pd(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_broadcast_i64x4(__m256i __A) { +__funline __m512i _mm512_broadcast_i64x4(__m256i __A) { return (__m512i)__builtin_ia32_broadcasti64x4_512( (__v4di)__A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A) { +__funline __m512i _mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, + __m256i __A) { return (__m512i)__builtin_ia32_broadcasti64x4_512((__v4di)__A, (__v8di)__O, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) { +__funline __m512i _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) { return (__m512i)__builtin_ia32_broadcasti64x4_512( (__v4di)__A, (__v8di)_mm512_setzero_si512(), __M); } @@ -3533,123 +2942,100 @@ typedef enum { } _MM_PERM_ENUM; #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shuffle_epi32(__m512i __A, _MM_PERM_ENUM __mask) { +__funline __m512i _mm512_shuffle_epi32(__m512i __A, _MM_PERM_ENUM __mask) { return (__m512i)__builtin_ia32_pshufd512_mask( (__v16si)__A, __mask, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_shuffle_epi32(__m512i __W, __mmask16 __U, __m512i __A, - _MM_PERM_ENUM __mask) { +__funline __m512i _mm512_mask_shuffle_epi32(__m512i __W, __mmask16 __U, + __m512i __A, _MM_PERM_ENUM __mask) { return (__m512i)__builtin_ia32_pshufd512_mask((__v16si)__A, __mask, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_maskz_shuffle_epi32(__mmask16 __U, __m512i __A, _MM_PERM_ENUM __mask) { +__funline __m512i _mm512_maskz_shuffle_epi32(__mmask16 __U, __m512i __A, + _MM_PERM_ENUM __mask) { return (__m512i)__builtin_ia32_pshufd512_mask( (__v16si)__A, __mask, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shuffle_i64x2(__m512i __A, __m512i __B, const int __imm) { +__funline __m512i _mm512_shuffle_i64x2(__m512i __A, __m512i __B, + const int __imm) { return (__m512i)__builtin_ia32_shuf_i64x2_mask( (__v8di)__A, (__v8di)__B, __imm, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_shuffle_i64x2(__m512i __W, __mmask8 __U, __m512i __A, - __m512i __B, const int __imm) { +__funline __m512i _mm512_mask_shuffle_i64x2(__m512i __W, __mmask8 __U, + __m512i __A, __m512i __B, + const int __imm) { return (__m512i)__builtin_ia32_shuf_i64x2_mask( (__v8di)__A, (__v8di)__B, __imm, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_shuffle_i64x2(__mmask8 __U, __m512i __A, __m512i __B, - const int __imm) { +__funline __m512i _mm512_maskz_shuffle_i64x2(__mmask8 __U, __m512i __A, + __m512i __B, const int __imm) { return (__m512i)__builtin_ia32_shuf_i64x2_mask( (__v8di)__A, (__v8di)__B, __imm, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shuffle_i32x4(__m512i __A, __m512i __B, const int __imm) { +__funline __m512i _mm512_shuffle_i32x4(__m512i __A, __m512i __B, + const int __imm) { return (__m512i)__builtin_ia32_shuf_i32x4_mask( (__v16si)__A, (__v16si)__B, __imm, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, __m512i __A, - __m512i __B, const int __imm) { +__funline __m512i _mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, + __m512i __A, __m512i __B, + const int __imm) { return (__m512i)__builtin_ia32_shuf_i32x4_mask( (__v16si)__A, (__v16si)__B, __imm, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, __m512i __B, - const int __imm) { +__funline __m512i _mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, + __m512i __B, const int __imm) { return (__m512i)__builtin_ia32_shuf_i32x4_mask( (__v16si)__A, (__v16si)__B, __imm, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shuffle_f64x2(__m512d __A, __m512d __B, const int __imm) { +__funline __m512d _mm512_shuffle_f64x2(__m512d __A, __m512d __B, + const int __imm) { return (__m512d)__builtin_ia32_shuf_f64x2_mask( (__v8df)__A, (__v8df)__B, __imm, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_shuffle_f64x2(__m512d __W, __mmask8 __U, __m512d __A, - __m512d __B, const int __imm) { +__funline __m512d _mm512_mask_shuffle_f64x2(__m512d __W, __mmask8 __U, + __m512d __A, __m512d __B, + const int __imm) { return (__m512d)__builtin_ia32_shuf_f64x2_mask( (__v8df)__A, (__v8df)__B, __imm, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_shuffle_f64x2(__mmask8 __U, __m512d __A, __m512d __B, - const int __imm) { +__funline __m512d _mm512_maskz_shuffle_f64x2(__mmask8 __U, __m512d __A, + __m512d __B, const int __imm) { return (__m512d)__builtin_ia32_shuf_f64x2_mask( (__v8df)__A, (__v8df)__B, __imm, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shuffle_f32x4(__m512 __A, __m512 __B, const int __imm) { +__funline __m512 _mm512_shuffle_f32x4(__m512 __A, __m512 __B, const int __imm) { return (__m512)__builtin_ia32_shuf_f32x4_mask( (__v16sf)__A, (__v16sf)__B, __imm, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_shuffle_f32x4(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B, - const int __imm) { +__funline __m512 _mm512_mask_shuffle_f32x4(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __imm) { 
return (__m512)__builtin_ia32_shuf_f32x4_mask( (__v16sf)__A, (__v16sf)__B, __imm, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_shuffle_f32x4(__mmask16 __U, __m512 __A, __m512 __B, - const int __imm) { +__funline __m512 _mm512_maskz_shuffle_f32x4(__mmask16 __U, __m512 __A, __m512 __B, + const int __imm) { return (__m512)__builtin_ia32_shuf_f32x4_mask( (__v16sf)__A, (__v16sf)__B, __imm, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); @@ -3731,135 +3117,109 @@ extern __inline __m512 (__v16sf)(__m512)_mm512_setzero_ps(), (__mmask16)(U))) #endif -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_rolv_epi32(__m512i __A, __m512i __B) { +__funline __m512i _mm512_rolv_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_prolvd512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_rolv_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_rolv_epi32(__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_prolvd512_mask((__v16si)__A, (__v16si)__B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_rolv_epi32(__mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_rolv_epi32(__mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_prolvd512_mask((__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_rorv_epi32(__m512i __A, __m512i __B) { +__funline __m512i _mm512_rorv_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_prorvd512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_rorv_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_rorv_epi32(__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_prorvd512_mask((__v16si)__A, (__v16si)__B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_rorv_epi32(__mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_rorv_epi32(__mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_prorvd512_mask((__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_rolv_epi64(__m512i __A, __m512i __B) { +__funline __m512i _mm512_rolv_epi64(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_prolvq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_rolv_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_rolv_epi64(__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_prolvq512_mask((__v8di)__A, (__v8di)__B, (__v8di)__W, (__mmask8)__U); } -extern __inline 
__m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_rolv_epi64(__mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_rolv_epi64(__mmask8 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_prolvq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_rorv_epi64(__m512i __A, __m512i __B) { +__funline __m512i _mm512_rorv_epi64(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_prorvq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_rorv_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_rorv_epi64(__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_prorvq512_mask((__v8di)__A, (__v8di)__B, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_rorv_epi64(__mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_rorv_epi64(__mmask8 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_prorvq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtt_roundpd_epi32(__m512d __A, const int __R) { +__funline __m256i _mm512_cvtt_roundpd_epi32(__m512d __A, const int __R) { return (__m256i)__builtin_ia32_cvttpd2dq512_mask( (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, __R); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtt_roundpd_epi32(__m256i __W, __mmask8 __U, __m512d __A, - const int __R) { +__funline __m256i _mm512_mask_cvtt_roundpd_epi32(__m256i __W, __mmask8 __U, + __m512d __A, const int __R) { return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)__A, (__v8si)__W, (__mmask8)__U, __R); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtt_roundpd_epi32(__mmask8 __U, __m512d __A, const int __R) { +__funline __m256i _mm512_maskz_cvtt_roundpd_epi32(__mmask8 __U, __m512d __A, + const int __R) { return (__m256i)__builtin_ia32_cvttpd2dq512_mask( (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, __R); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtt_roundpd_epu32(__m512d __A, const int __R) { +__funline __m256i _mm512_cvtt_roundpd_epu32(__m512d __A, const int __R) { return (__m256i)__builtin_ia32_cvttpd2udq512_mask( (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, __R); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtt_roundpd_epu32(__m256i __W, __mmask8 __U, __m512d __A, - const int __R) { +__funline __m256i _mm512_mask_cvtt_roundpd_epu32(__m256i __W, __mmask8 __U, + __m512d __A, const int __R) { return (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)__A, (__v8si)__W, (__mmask8)__U, __R); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtt_roundpd_epu32(__mmask8 __U, __m512d __A, const int __R) { +__funline __m256i _mm512_maskz_cvtt_roundpd_epu32(__mmask8 
__U, __m512d __A, + const int __R) { return (__m256i)__builtin_ia32_cvttpd2udq512_mask( (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, __R); } @@ -3888,46 +3248,36 @@ extern __inline __m256i #endif #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundpd_epi32(__m512d __A, const int __R) { +__funline __m256i _mm512_cvt_roundpd_epi32(__m512d __A, const int __R) { return (__m256i)__builtin_ia32_cvtpd2dq512_mask( (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, __R); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundpd_epi32(__m256i __W, __mmask8 __U, __m512d __A, - const int __R) { +__funline __m256i _mm512_mask_cvt_roundpd_epi32(__m256i __W, __mmask8 __U, + __m512d __A, const int __R) { return (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)__A, (__v8si)__W, (__mmask8)__U, __R); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundpd_epi32(__mmask8 __U, __m512d __A, const int __R) { +__funline __m256i _mm512_maskz_cvt_roundpd_epi32(__mmask8 __U, __m512d __A, + const int __R) { return (__m256i)__builtin_ia32_cvtpd2dq512_mask( (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, __R); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundpd_epu32(__m512d __A, const int __R) { +__funline __m256i _mm512_cvt_roundpd_epu32(__m512d __A, const int __R) { return (__m256i)__builtin_ia32_cvtpd2udq512_mask( (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1, __R); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundpd_epu32(__m256i __W, __mmask8 __U, __m512d __A, - const int __R) { +__funline __m256i _mm512_mask_cvt_roundpd_epu32(__m256i __W, __mmask8 __U, + __m512d __A, const int __R) { return (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)__A, (__v8si)__W, (__mmask8)__U, __R); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundpd_epu32(__mmask8 __U, __m512d __A, const int __R) { +__funline __m256i _mm512_maskz_cvt_roundpd_epu32(__mmask8 __U, __m512d __A, + const int __R) { return (__m256i)__builtin_ia32_cvtpd2udq512_mask( (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U, __R); } @@ -3956,46 +3306,36 @@ extern __inline __m256i #endif #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtt_roundps_epi32(__m512 __A, const int __R) { +__funline __m512i _mm512_cvtt_roundps_epi32(__m512 __A, const int __R) { return (__m512i)__builtin_ia32_cvttps2dq512_mask( (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtt_roundps_epi32(__m512i __W, __mmask16 __U, __m512 __A, - const int __R) { +__funline __m512i _mm512_mask_cvtt_roundps_epi32(__m512i __W, __mmask16 __U, + __m512 __A, const int __R) { return (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)__A, (__v16si)__W, (__mmask16)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtt_roundps_epi32(__mmask16 __U, __m512 __A, const int __R) { +__funline __m512i _mm512_maskz_cvtt_roundps_epi32(__mmask16 __U, __m512 __A, + const int __R) { 
return (__m512i)__builtin_ia32_cvttps2dq512_mask( (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtt_roundps_epu32(__m512 __A, const int __R) { +__funline __m512i _mm512_cvtt_roundps_epu32(__m512 __A, const int __R) { return (__m512i)__builtin_ia32_cvttps2udq512_mask( (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtt_roundps_epu32(__m512i __W, __mmask16 __U, __m512 __A, - const int __R) { +__funline __m512i _mm512_mask_cvtt_roundps_epu32(__m512i __W, __mmask16 __U, + __m512 __A, const int __R) { return (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)__A, (__v16si)__W, (__mmask16)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtt_roundps_epu32(__mmask16 __U, __m512 __A, const int __R) { +__funline __m512i _mm512_maskz_cvtt_roundps_epu32(__mmask16 __U, __m512 __A, + const int __R) { return (__m512i)__builtin_ia32_cvttps2udq512_mask( (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, __R); } @@ -4024,46 +3364,36 @@ extern __inline __m512i #endif #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundps_epi32(__m512 __A, const int __R) { +__funline __m512i _mm512_cvt_roundps_epi32(__m512 __A, const int __R) { return (__m512i)__builtin_ia32_cvtps2dq512_mask( (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundps_epi32(__m512i __W, __mmask16 __U, __m512 __A, - const int __R) { +__funline __m512i _mm512_mask_cvt_roundps_epi32(__m512i __W, __mmask16 __U, + __m512 __A, const int __R) { return (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)__A, (__v16si)__W, (__mmask16)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundps_epi32(__mmask16 __U, __m512 __A, const int __R) { +__funline __m512i _mm512_maskz_cvt_roundps_epi32(__mmask16 __U, __m512 __A, + const int __R) { return (__m512i)__builtin_ia32_cvtps2dq512_mask( (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundps_epu32(__m512 __A, const int __R) { +__funline __m512i _mm512_cvt_roundps_epu32(__m512 __A, const int __R) { return (__m512i)__builtin_ia32_cvtps2udq512_mask( (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundps_epu32(__m512i __W, __mmask16 __U, __m512 __A, - const int __R) { +__funline __m512i _mm512_mask_cvt_roundps_epu32(__m512i __W, __mmask16 __U, + __m512 __A, const int __R) { return (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)__A, (__v16si)__W, (__mmask16)__U, __R); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundps_epu32(__mmask16 __U, __m512 __A, const int __R) { +__funline __m512i _mm512_maskz_cvt_roundps_epu32(__mmask16 __U, __m512 __A, + const int __R) { return (__m512i)__builtin_ia32_cvtps2udq512_mask( (__v16sf)__A, 
(__v16si)_mm512_setzero_si512(), (__mmask16)__U, __R); } @@ -4091,29 +3421,23 @@ extern __inline __m512i A, (__v16si)_mm512_setzero_si512(), U, B)) #endif -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtu32_sd(__m128d __A, unsigned __B) { +__funline __m128d _mm_cvtu32_sd(__m128d __A, unsigned __B) { return (__m128d)__builtin_ia32_cvtusi2sd32((__v2df)__A, __B); } #ifdef __x86_64__ #ifdef __OPTIMIZE__ -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundu64_sd(__m128d __A, unsigned long long __B, const int __R) { +__funline __m128d _mm_cvt_roundu64_sd(__m128d __A, unsigned long long __B, + const int __R) { return (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)__A, __B, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundi64_sd(__m128d __A, long long __B, const int __R) { +__funline __m128d _mm_cvt_roundi64_sd(__m128d __A, long long __B, const int __R) { return (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)__A, __B, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundsi64_sd(__m128d __A, long long __B, const int __R) { +__funline __m128d _mm_cvt_roundsi64_sd(__m128d __A, long long __B, + const int __R) { return (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)__A, __B, __R); } #else @@ -4130,21 +3454,15 @@ extern __inline __m128d #endif #ifdef __OPTIMIZE__ -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundu32_ss(__m128 __A, unsigned __B, const int __R) { +__funline __m128 _mm_cvt_roundu32_ss(__m128 __A, unsigned __B, const int __R) { return (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)__A, __B, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundsi32_ss(__m128 __A, int __B, const int __R) { +__funline __m128 _mm_cvt_roundsi32_ss(__m128 __A, int __B, const int __R) { return (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)__A, __B, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundi32_ss(__m128 __A, int __B, const int __R) { +__funline __m128 _mm_cvt_roundi32_ss(__m128 __A, int __B, const int __R) { return (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)__A, __B, __R); } #else @@ -4159,21 +3477,16 @@ extern __inline __m128 #ifdef __x86_64__ #ifdef __OPTIMIZE__ -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundu64_ss(__m128 __A, unsigned long long __B, const int __R) { +__funline __m128 _mm_cvt_roundu64_ss(__m128 __A, unsigned long long __B, + const int __R) { return (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)__A, __B, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundsi64_ss(__m128 __A, long long __B, const int __R) { +__funline __m128 _mm_cvt_roundsi64_ss(__m128 __A, long long __B, const int __R) { return (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)__A, __B, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundi64_ss(__m128 __A, long long __B, const int __R) { +__funline __m128 _mm_cvt_roundi64_ss(__m128 __A, long long __B, const int __R) { return (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)__A, __B, __R); } #else @@ -4188,489 +3501,379 @@ extern __inline __m128 #endif -extern __inline __m128i - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepi32_epi8(__m512i __A) { +__funline __m128i _mm512_cvtepi32_epi8(__m512i __A) { return (__m128i)__builtin_ia32_pmovdb512_mask( (__v16si)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi32_storeu_epi8(void *__P, __mmask16 __M, __m512i __A) { +__funline void _mm512_mask_cvtepi32_storeu_epi8(void *__P, __mmask16 __M, + __m512i __A) { __builtin_ia32_pmovdb512mem_mask((__v16qi *)__P, (__v16si)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A) { +__funline __m128i _mm512_mask_cvtepi32_epi8(__m128i __O, __mmask16 __M, + __m512i __A) { return (__m128i)__builtin_ia32_pmovdb512_mask((__v16si)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepi32_epi8(__mmask16 __M, __m512i __A) { +__funline __m128i _mm512_maskz_cvtepi32_epi8(__mmask16 __M, __m512i __A) { return (__m128i)__builtin_ia32_pmovdb512_mask( (__v16si)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtsepi32_epi8(__m512i __A) { +__funline __m128i _mm512_cvtsepi32_epi8(__m512i __A) { return (__m128i)__builtin_ia32_pmovsdb512_mask( (__v16si)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtsepi32_storeu_epi8(void *__P, __mmask16 __M, __m512i __A) { +__funline void _mm512_mask_cvtsepi32_storeu_epi8(void *__P, __mmask16 __M, + __m512i __A) { __builtin_ia32_pmovsdb512mem_mask((__v16qi *)__P, (__v16si)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtsepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A) { +__funline __m128i _mm512_mask_cvtsepi32_epi8(__m128i __O, __mmask16 __M, + __m512i __A) { return (__m128i)__builtin_ia32_pmovsdb512_mask((__v16si)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtsepi32_epi8(__mmask16 __M, __m512i __A) { +__funline __m128i _mm512_maskz_cvtsepi32_epi8(__mmask16 __M, __m512i __A) { return (__m128i)__builtin_ia32_pmovsdb512_mask( (__v16si)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtusepi32_epi8(__m512i __A) { +__funline __m128i _mm512_cvtusepi32_epi8(__m512i __A) { return (__m128i)__builtin_ia32_pmovusdb512_mask( (__v16si)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtusepi32_storeu_epi8(void *__P, __mmask16 __M, __m512i __A) { +__funline void _mm512_mask_cvtusepi32_storeu_epi8(void *__P, __mmask16 __M, + __m512i __A) { __builtin_ia32_pmovusdb512mem_mask((__v16qi *)__P, (__v16si)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtusepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A) { +__funline __m128i _mm512_mask_cvtusepi32_epi8(__m128i __O, __mmask16 __M, + __m512i __A) { return (__m128i)__builtin_ia32_pmovusdb512_mask((__v16si)__A, (__v16qi)__O, __M); } 
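/* A minimal usage sketch of the __funline-style intrinsics converted above,
   exercising _mm512_maskz_cvtusepi32_epi8 as one example; demo(), its pointer
   arguments, and the 0x00FF mask are illustrative assumptions, not part of
   this patch. Assumes compilation with AVX-512F enabled (e.g. -mavx512f). */
#include <immintrin.h>

void demo(const int *src, unsigned char *dst) {
  __m512i v = _mm512_loadu_si512(src);      /* load 16 int32 lanes */
  __mmask16 m = 0x00FF;                     /* keep only the low 8 lanes */
  /* unsigned saturating 32->8 narrowing; masked-off lanes are zeroed */
  __m128i packed = _mm512_maskz_cvtusepi32_epi8(m, v);
  _mm_storeu_si128((__m128i *)dst, packed); /* store all 16 result bytes */
}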
-extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtusepi32_epi8(__mmask16 __M, __m512i __A) { +__funline __m128i _mm512_maskz_cvtusepi32_epi8(__mmask16 __M, __m512i __A) { return (__m128i)__builtin_ia32_pmovusdb512_mask( (__v16si)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepi32_epi16(__m512i __A) { +__funline __m256i _mm512_cvtepi32_epi16(__m512i __A) { return (__m256i)__builtin_ia32_pmovdw512_mask( (__v16si)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A) { +__funline void _mm512_mask_cvtepi32_storeu_epi16(void *__P, __mmask16 __M, + __m512i __A) { __builtin_ia32_pmovdw512mem_mask((__v16hi *)__P, (__v16si)__A, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A) { +__funline __m256i _mm512_mask_cvtepi32_epi16(__m256i __O, __mmask16 __M, + __m512i __A) { return (__m256i)__builtin_ia32_pmovdw512_mask((__v16si)__A, (__v16hi)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepi32_epi16(__mmask16 __M, __m512i __A) { +__funline __m256i _mm512_maskz_cvtepi32_epi16(__mmask16 __M, __m512i __A) { return (__m256i)__builtin_ia32_pmovdw512_mask( (__v16si)__A, (__v16hi)_mm256_setzero_si256(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtsepi32_epi16(__m512i __A) { +__funline __m256i _mm512_cvtsepi32_epi16(__m512i __A) { return (__m256i)__builtin_ia32_pmovsdw512_mask( (__v16si)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtsepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A) { +__funline void _mm512_mask_cvtsepi32_storeu_epi16(void *__P, __mmask16 __M, + __m512i __A) { __builtin_ia32_pmovsdw512mem_mask((__v16hi *)__P, (__v16si)__A, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtsepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A) { +__funline __m256i _mm512_mask_cvtsepi32_epi16(__m256i __O, __mmask16 __M, + __m512i __A) { return (__m256i)__builtin_ia32_pmovsdw512_mask((__v16si)__A, (__v16hi)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtsepi32_epi16(__mmask16 __M, __m512i __A) { +__funline __m256i _mm512_maskz_cvtsepi32_epi16(__mmask16 __M, __m512i __A) { return (__m256i)__builtin_ia32_pmovsdw512_mask( (__v16si)__A, (__v16hi)_mm256_setzero_si256(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtusepi32_epi16(__m512i __A) { +__funline __m256i _mm512_cvtusepi32_epi16(__m512i __A) { return (__m256i)__builtin_ia32_pmovusdw512_mask( (__v16si)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtusepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A) { +__funline void _mm512_mask_cvtusepi32_storeu_epi16(void *__P, __mmask16 __M, + __m512i __A) { 
__builtin_ia32_pmovusdw512mem_mask((__v16hi *)__P, (__v16si)__A, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtusepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A) { +__funline __m256i _mm512_mask_cvtusepi32_epi16(__m256i __O, __mmask16 __M, + __m512i __A) { return (__m256i)__builtin_ia32_pmovusdw512_mask((__v16si)__A, (__v16hi)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtusepi32_epi16(__mmask16 __M, __m512i __A) { +__funline __m256i _mm512_maskz_cvtusepi32_epi16(__mmask16 __M, __m512i __A) { return (__m256i)__builtin_ia32_pmovusdw512_mask( (__v16si)__A, (__v16hi)_mm256_setzero_si256(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepi64_epi32(__m512i __A) { +__funline __m256i _mm512_cvtepi64_epi32(__m512i __A) { return (__m256i)__builtin_ia32_pmovqd512_mask( (__v8di)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A) { +__funline void _mm512_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, + __m512i __A) { __builtin_ia32_pmovqd512mem_mask((__v8si *)__P, (__v8di)__A, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) { +__funline __m256i _mm512_mask_cvtepi64_epi32(__m256i __O, __mmask8 __M, + __m512i __A) { return (__m256i)__builtin_ia32_pmovqd512_mask((__v8di)__A, (__v8si)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepi64_epi32(__mmask8 __M, __m512i __A) { +__funline __m256i _mm512_maskz_cvtepi64_epi32(__mmask8 __M, __m512i __A) { return (__m256i)__builtin_ia32_pmovqd512_mask( (__v8di)__A, (__v8si)_mm256_setzero_si256(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtsepi64_epi32(__m512i __A) { +__funline __m256i _mm512_cvtsepi64_epi32(__m512i __A) { return (__m256i)__builtin_ia32_pmovsqd512_mask( (__v8di)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A) { +__funline void _mm512_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, + __m512i __A) { __builtin_ia32_pmovsqd512mem_mask((__v8si *)__P, (__v8di)__A, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtsepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) { +__funline __m256i _mm512_mask_cvtsepi64_epi32(__m256i __O, __mmask8 __M, + __m512i __A) { return (__m256i)__builtin_ia32_pmovsqd512_mask((__v8di)__A, (__v8si)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtsepi64_epi32(__mmask8 __M, __m512i __A) { +__funline __m256i _mm512_maskz_cvtsepi64_epi32(__mmask8 __M, __m512i __A) { return (__m256i)__builtin_ia32_pmovsqd512_mask( (__v8di)__A, (__v8si)_mm256_setzero_si256(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtusepi64_epi32(__m512i __A) { +__funline __m256i _mm512_cvtusepi64_epi32(__m512i __A) { return 
(__m256i)__builtin_ia32_pmovusqd512_mask( (__v8di)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A) { +__funline void _mm512_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, + __m512i __A) { __builtin_ia32_pmovusqd512mem_mask((__v8si *)__P, (__v8di)__A, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtusepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) { +__funline __m256i _mm512_mask_cvtusepi64_epi32(__m256i __O, __mmask8 __M, + __m512i __A) { return (__m256i)__builtin_ia32_pmovusqd512_mask((__v8di)__A, (__v8si)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtusepi64_epi32(__mmask8 __M, __m512i __A) { +__funline __m256i _mm512_maskz_cvtusepi64_epi32(__mmask8 __M, __m512i __A) { return (__m256i)__builtin_ia32_pmovusqd512_mask( (__v8di)__A, (__v8si)_mm256_setzero_si256(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepi64_epi16(__m512i __A) { +__funline __m128i _mm512_cvtepi64_epi16(__m512i __A) { return (__m128i)__builtin_ia32_pmovqw512_mask( (__v8di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A) { +__funline void _mm512_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, + __m512i __A) { __builtin_ia32_pmovqw512mem_mask((__v8hi *)__P, (__v8di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A) { +__funline __m128i _mm512_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, + __m512i __A) { return (__m128i)__builtin_ia32_pmovqw512_mask((__v8di)__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepi64_epi16(__mmask8 __M, __m512i __A) { +__funline __m128i _mm512_maskz_cvtepi64_epi16(__mmask8 __M, __m512i __A) { return (__m128i)__builtin_ia32_pmovqw512_mask( (__v8di)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtsepi64_epi16(__m512i __A) { +__funline __m128i _mm512_cvtsepi64_epi16(__m512i __A) { return (__m128i)__builtin_ia32_pmovsqw512_mask( (__v8di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A) { +__funline void _mm512_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, + __m512i __A) { __builtin_ia32_pmovsqw512mem_mask((__v8hi *)__P, (__v8di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A) { +__funline __m128i _mm512_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, + __m512i __A) { return (__m128i)__builtin_ia32_pmovsqw512_mask((__v8di)__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtsepi64_epi16(__mmask8 __M, __m512i __A) { +__funline __m128i 
_mm512_maskz_cvtsepi64_epi16(__mmask8 __M, __m512i __A) { return (__m128i)__builtin_ia32_pmovsqw512_mask( (__v8di)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtusepi64_epi16(__m512i __A) { +__funline __m128i _mm512_cvtusepi64_epi16(__m512i __A) { return (__m128i)__builtin_ia32_pmovusqw512_mask( (__v8di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A) { +__funline void _mm512_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, + __m512i __A) { __builtin_ia32_pmovusqw512mem_mask((__v8hi *)__P, (__v8di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A) { +__funline __m128i _mm512_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, + __m512i __A) { return (__m128i)__builtin_ia32_pmovusqw512_mask((__v8di)__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtusepi64_epi16(__mmask8 __M, __m512i __A) { +__funline __m128i _mm512_maskz_cvtusepi64_epi16(__mmask8 __M, __m512i __A) { return (__m128i)__builtin_ia32_pmovusqw512_mask( (__v8di)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepi64_epi8(__m512i __A) { +__funline __m128i _mm512_cvtepi64_epi8(__m512i __A) { return (__m128i)__builtin_ia32_pmovqb512_mask( (__v8di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, __m512i __A) { +__funline void _mm512_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, + __m512i __A) { __builtin_ia32_pmovqb512mem_mask((__v16qi *)__P, (__v8di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A) { +__funline __m128i _mm512_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, + __m512i __A) { return (__m128i)__builtin_ia32_pmovqb512_mask((__v8di)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepi64_epi8(__mmask8 __M, __m512i __A) { +__funline __m128i _mm512_maskz_cvtepi64_epi8(__mmask8 __M, __m512i __A) { return (__m128i)__builtin_ia32_pmovqb512_mask( (__v8di)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtsepi64_epi8(__m512i __A) { +__funline __m128i _mm512_cvtsepi64_epi8(__m512i __A) { return (__m128i)__builtin_ia32_pmovsqb512_mask( (__v8di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, __m512i __A) { +__funline void _mm512_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, + __m512i __A) { __builtin_ia32_pmovsqb512mem_mask((__v16qi *)__P, (__v8di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A) { +__funline 
__m128i _mm512_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, + __m512i __A) { return (__m128i)__builtin_ia32_pmovsqb512_mask((__v8di)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtsepi64_epi8(__mmask8 __M, __m512i __A) { +__funline __m128i _mm512_maskz_cvtsepi64_epi8(__mmask8 __M, __m512i __A) { return (__m128i)__builtin_ia32_pmovsqb512_mask( (__v8di)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtusepi64_epi8(__m512i __A) { +__funline __m128i _mm512_cvtusepi64_epi8(__m512i __A) { return (__m128i)__builtin_ia32_pmovusqb512_mask( (__v8di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, __m512i __A) { +__funline void _mm512_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, + __m512i __A) { __builtin_ia32_pmovusqb512mem_mask((__v16qi *)__P, (__v8di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A) { +__funline __m128i _mm512_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, + __m512i __A) { return (__m128i)__builtin_ia32_pmovusqb512_mask((__v8di)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtusepi64_epi8(__mmask8 __M, __m512i __A) { +__funline __m128i _mm512_maskz_cvtusepi64_epi8(__mmask8 __M, __m512i __A) { return (__m128i)__builtin_ia32_pmovusqb512_mask( (__v8di)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepi32_pd(__m256i __A) { +__funline __m512d _mm512_cvtepi32_pd(__m256i __A) { return (__m512d)__builtin_ia32_cvtdq2pd512_mask( (__v8si)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepi32_pd(__m512d __W, __mmask8 __U, __m256i __A) { +__funline __m512d _mm512_mask_cvtepi32_pd(__m512d __W, __mmask8 __U, + __m256i __A) { return (__m512d)__builtin_ia32_cvtdq2pd512_mask((__v8si)__A, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepi32_pd(__mmask8 __U, __m256i __A) { +__funline __m512d _mm512_maskz_cvtepi32_pd(__mmask8 __U, __m256i __A) { return (__m512d)__builtin_ia32_cvtdq2pd512_mask( (__v8si)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtepu32_pd(__m256i __A) { +__funline __m512d _mm512_cvtepu32_pd(__m256i __A) { return (__m512d)__builtin_ia32_cvtudq2pd512_mask( (__v8si)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvtepu32_pd(__m512d __W, __mmask8 __U, __m256i __A) { +__funline __m512d _mm512_mask_cvtepu32_pd(__m512d __W, __mmask8 __U, + __m256i __A) { return (__m512d)__builtin_ia32_cvtudq2pd512_mask((__v8si)__A, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtepu32_pd(__mmask8 __U, __m256i __A) { 
+__funline __m512d _mm512_maskz_cvtepu32_pd(__mmask8 __U, __m256i __A) { return (__m512d)__builtin_ia32_cvtudq2pd512_mask( (__v8si)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } #ifdef __OPTIMIZE__ -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundepi32_ps(__m512i __A, const int __R) { +__funline __m512 _mm512_cvt_roundepi32_ps(__m512i __A, const int __R) { return (__m512)__builtin_ia32_cvtdq2ps512_mask( (__v16si)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundepi32_ps(__m512 __W, __mmask16 __U, __m512i __A, - const int __R) { +__funline __m512 _mm512_mask_cvt_roundepi32_ps(__m512 __W, __mmask16 __U, + __m512i __A, const int __R) { return (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)__A, (__v16sf)__W, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundepi32_ps(__mmask16 __U, __m512i __A, const int __R) { +__funline __m512 _mm512_maskz_cvt_roundepi32_ps(__mmask16 __U, __m512i __A, + const int __R) { return (__m512)__builtin_ia32_cvtdq2ps512_mask( (__v16si)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundepu32_ps(__m512i __A, const int __R) { +__funline __m512 _mm512_cvt_roundepu32_ps(__m512i __A, const int __R) { return (__m512)__builtin_ia32_cvtudq2ps512_mask( (__v16si)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundepu32_ps(__m512 __W, __mmask16 __U, __m512i __A, - const int __R) { +__funline __m512 _mm512_mask_cvt_roundepu32_ps(__m512 __W, __mmask16 __U, + __m512i __A, const int __R) { return (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)__A, (__v16sf)__W, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundepu32_ps(__mmask16 __U, __m512i __A, const int __R) { +__funline __m512 _mm512_maskz_cvt_roundepu32_ps(__mmask16 __U, __m512i __A, + const int __R) { return (__m512)__builtin_ia32_cvtudq2ps512_mask( (__v16si)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); } @@ -4700,90 +3903,70 @@ extern __inline __m512 #endif #ifdef __OPTIMIZE__ -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_extractf64x4_pd(__m512d __A, const int __imm) { +__funline __m256d _mm512_extractf64x4_pd(__m512d __A, const int __imm) { return (__m256d)__builtin_ia32_extractf64x4_mask( (__v8df)__A, __imm, (__v4df)_mm256_undefined_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_extractf64x4_pd(__m256d __W, __mmask8 __U, __m512d __A, - const int __imm) { +__funline __m256d _mm512_mask_extractf64x4_pd(__m256d __W, __mmask8 __U, + __m512d __A, const int __imm) { return (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)__A, __imm, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_extractf64x4_pd(__mmask8 __U, __m512d __A, const int __imm) { +__funline __m256d _mm512_maskz_extractf64x4_pd(__mmask8 __U, __m512d __A, + const int __imm) { return 
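/*
 * Aside: the _round conversions above take an explicit _MM_FROUND_* control
 * instead of reading MXCSR, which is why the function forms live under
 * __OPTIMIZE__: the const int has to reach the builtin as a compile-time
 * constant. Sketch, same assumptions as the earlier one:
 */
__m512 int_to_float_rtz(__m512i v) {
  /* 16 x int32 -> float, rounding toward zero, exceptions suppressed */
  return _mm512_cvt_roundepi32_ps(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}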
(__m256d)__builtin_ia32_extractf64x4_mask( (__v8df)__A, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_extractf32x4_ps(__m512 __A, const int __imm) { +__funline __m128 _mm512_extractf32x4_ps(__m512 __A, const int __imm) { return (__m128)__builtin_ia32_extractf32x4_mask( (__v16sf)__A, __imm, (__v4sf)_mm_undefined_ps(), (__mmask8)-1); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m512 __A, - const int __imm) { +__funline __m128 _mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m512 __A, + const int __imm) { return (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)__A, __imm, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_extractf32x4_ps(__mmask8 __U, __m512 __A, const int __imm) { +__funline __m128 _mm512_maskz_extractf32x4_ps(__mmask8 __U, __m512 __A, + const int __imm) { return (__m128)__builtin_ia32_extractf32x4_mask( (__v16sf)__A, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_extracti64x4_epi64(__m512i __A, const int __imm) { +__funline __m256i _mm512_extracti64x4_epi64(__m512i __A, const int __imm) { return (__m256i)__builtin_ia32_extracti64x4_mask( (__v8di)__A, __imm, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __A, - const int __imm) { +__funline __m256i _mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, + __m512i __A, const int __imm) { return (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)__A, __imm, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A, const int __imm) { +__funline __m256i _mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A, + const int __imm) { return (__m256i)__builtin_ia32_extracti64x4_mask( (__v8di)__A, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_extracti32x4_epi32(__m512i __A, const int __imm) { +__funline __m128i _mm512_extracti32x4_epi32(__m512i __A, const int __imm) { return (__m128i)__builtin_ia32_extracti32x4_mask( (__v16si)__A, __imm, (__v4si)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __A, - const int __imm) { +__funline __m128i _mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, + __m512i __A, const int __imm) { return (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)__A, __imm, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A, const int __imm) { +__funline __m128i _mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A, + const int __imm) { return (__m128i)__builtin_ia32_extracti32x4_mask( (__v16si)__A, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } @@ -4847,65 +4030,49 @@ 
_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A, const int __imm) { #endif #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_inserti32x4(__m512i __A, __m128i __B, const int __imm) { +__funline __m512i _mm512_inserti32x4(__m512i __A, __m128i __B, const int __imm) { return (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)__A, (__v4si)__B, __imm, (__v16si)__A, -1); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_insertf32x4(__m512 __A, __m128 __B, const int __imm) { +__funline __m512 _mm512_insertf32x4(__m512 __A, __m128 __B, const int __imm) { return (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)__A, (__v4sf)__B, __imm, (__v16sf)__A, -1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_inserti64x4(__m512i __A, __m256i __B, const int __imm) { +__funline __m512i _mm512_inserti64x4(__m512i __A, __m256i __B, const int __imm) { return (__m512i)__builtin_ia32_inserti64x4_mask( (__v8di)__A, (__v4di)__B, __imm, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_inserti64x4(__m512i __W, __mmask8 __U, __m512i __A, __m256i __B, - const int __imm) { +__funline __m512i _mm512_mask_inserti64x4(__m512i __W, __mmask8 __U, __m512i __A, + __m256i __B, const int __imm) { return (__m512i)__builtin_ia32_inserti64x4_mask( (__v8di)__A, (__v4di)__B, __imm, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_inserti64x4(__mmask8 __U, __m512i __A, __m256i __B, - const int __imm) { +__funline __m512i _mm512_maskz_inserti64x4(__mmask8 __U, __m512i __A, __m256i __B, + const int __imm) { return (__m512i)__builtin_ia32_inserti64x4_mask( (__v8di)__A, (__v4di)__B, __imm, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_insertf64x4(__m512d __A, __m256d __B, const int __imm) { +__funline __m512d _mm512_insertf64x4(__m512d __A, __m256d __B, const int __imm) { return (__m512d)__builtin_ia32_insertf64x4_mask( (__v8df)__A, (__v4df)__B, __imm, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_insertf64x4(__m512d __W, __mmask8 __U, __m512d __A, __m256d __B, - const int __imm) { +__funline __m512d _mm512_mask_insertf64x4(__m512d __W, __mmask8 __U, __m512d __A, + __m256d __B, const int __imm) { return (__m512d)__builtin_ia32_insertf64x4_mask( (__v8df)__A, (__v4df)__B, __imm, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_insertf64x4(__mmask8 __U, __m512d __A, __m256d __B, - const int __imm) { +__funline __m512d _mm512_maskz_insertf64x4(__mmask8 __U, __m512d __A, __m256d __B, + const int __imm) { return (__m512d)__builtin_ia32_insertf64x4_mask( (__v8df)__A, (__v4df)__B, __imm, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); @@ -4952,254 +4119,191 @@ extern __inline __m512d (__v8di)(__m512i)_mm512_setzero_si512(), (__mmask8)(U))) #endif -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_loadu_pd(void const *__P) { +__funline __m512d _mm512_loadu_pd(void const *__P) { return *(__m512d_u *)__P; } -extern 
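/*
 * Aside: extractf/extracti pull an aligned 128- or 256-bit slice selected
 * by the immediate, and insertf/inserti write one back. Sketch, same
 * assumptions as before:
 */
__m256d upper_half(__m512d v) {
  return _mm512_extractf64x4_pd(v, 1); /* doubles 4..7 */
}
__m512d replace_lower_half(__m512d v, __m256d half) {
  return _mm512_insertf64x4(v, half, 0); /* overwrite doubles 0..3 */
}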
__inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_loadu_pd(__m512d __W, __mmask8 __U, void const *__P) { +__funline __m512d _mm512_mask_loadu_pd(__m512d __W, __mmask8 __U, + void const *__P) { return (__m512d)__builtin_ia32_loadupd512_mask((const double *)__P, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P) { +__funline __m512d _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P) { return (__m512d)__builtin_ia32_loadupd512_mask( (const double *)__P, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_storeu_pd(void *__P, __m512d __A) { +__funline void _mm512_storeu_pd(void *__P, __m512d __A) { *(__m512d_u *)__P = __A; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A) { +__funline void _mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A) { __builtin_ia32_storeupd512_mask((double *)__P, (__v8df)__A, (__mmask8)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_loadu_ps(void const *__P) { +__funline __m512 _mm512_loadu_ps(void const *__P) { return *(__m512_u *)__P; } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_loadu_ps(__m512 __W, __mmask16 __U, void const *__P) { +__funline __m512 _mm512_mask_loadu_ps(__m512 __W, __mmask16 __U, + void const *__P) { return (__m512)__builtin_ia32_loadups512_mask((const float *)__P, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P) { +__funline __m512 _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P) { return (__m512)__builtin_ia32_loadups512_mask( (const float *)__P, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_storeu_ps(void *__P, __m512 __A) { +__funline void _mm512_storeu_ps(void *__P, __m512 __A) { *(__m512_u *)__P = __A; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A) { +__funline void _mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A) { __builtin_ia32_storeups512_mask((float *)__P, (__v16sf)__A, (__mmask16)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_load_ss(__m128 __W, __mmask8 __U, const float *__P) { +__funline __m128 _mm_mask_load_ss(__m128 __W, __mmask8 __U, const float *__P) { return (__m128)__builtin_ia32_loadss_mask(__P, (__v4sf)__W, __U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_load_ss(__mmask8 __U, const float *__P) { +__funline __m128 _mm_maskz_load_ss(__mmask8 __U, const float *__P) { return (__m128)__builtin_ia32_loadss_mask(__P, (__v4sf)_mm_setzero_ps(), __U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_load_sd(__m128d __W, __mmask8 __U, const double *__P) { +__funline __m128d _mm_mask_load_sd(__m128d __W, __mmask8 __U, const double *__P) { return (__m128d)__builtin_ia32_loadsd_mask(__P, 
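/*
 * Aside: masked loads and stores only touch the selected elements, so they
 * are the usual way to handle a partial final iteration without reading or
 * writing past the end of a buffer. Sketch, same assumptions as before,
 * with n in 0..8:
 */
__m512d load_first_n(const double *p, int n) {
  __mmask8 k = (__mmask8)((1u << n) - 1);
  return _mm512_maskz_loadu_pd(k, p); /* deselected lanes read as 0.0 */
}
void store_first_n(double *p, int n, __m512d v) {
  __mmask8 k = (__mmask8)((1u << n) - 1);
  _mm512_mask_storeu_pd(p, k, v); /* writes only the selected lanes */
}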
(__v2df)__W, __U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_load_sd(__mmask8 __U, const double *__P) { +__funline __m128d _mm_maskz_load_sd(__mmask8 __U, const double *__P) { return (__m128d)__builtin_ia32_loadsd_mask(__P, (__v2df)_mm_setzero_pd(), __U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_move_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_move_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_movess_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, __U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_move_ss(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_move_ss(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_movess_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), __U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_move_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_move_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_movesd_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, __U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_move_sd(__mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_move_sd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_movesd_mask((__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), __U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_store_ss(float *__P, __mmask8 __U, __m128 __A) { +__funline void _mm_mask_store_ss(float *__P, __mmask8 __U, __m128 __A) { __builtin_ia32_storess_mask(__P, (__v4sf)__A, (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_store_sd(double *__P, __mmask8 __U, __m128d __A) { +__funline void _mm_mask_store_sd(double *__P, __mmask8 __U, __m128d __A) { __builtin_ia32_storesd_mask(__P, (__v2df)__A, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_loadu_epi64(__m512i __W, __mmask8 __U, void const *__P) { +__funline __m512i _mm512_mask_loadu_epi64(__m512i __W, __mmask8 __U, + void const *__P) { return (__m512i)__builtin_ia32_loaddqudi512_mask((const long long *)__P, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P) { +__funline __m512i _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P) { return (__m512i)__builtin_ia32_loaddqudi512_mask( (const long long *)__P, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) { +__funline void _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) { __builtin_ia32_storedqudi512_mask((long long *)__P, (__v8di)__A, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_loadu_si512(void const *__P) { +__funline __m512i _mm512_loadu_si512(void const *__P) { return *(__m512i_u *)__P; } -extern __inline __m512i - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_loadu_epi32(__m512i __W, __mmask16 __U, void const *__P) { +__funline __m512i _mm512_mask_loadu_epi32(__m512i __W, __mmask16 __U, + void const *__P) { return (__m512i)__builtin_ia32_loaddqusi512_mask( (const int *)__P, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P) { +__funline __m512i _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P) { return (__m512i)__builtin_ia32_loaddqusi512_mask( (const int *)__P, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_storeu_si512(void *__P, __m512i __A) { +__funline void _mm512_storeu_si512(void *__P, __m512i __A) { *(__m512i_u *)__P = __A; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A) { +__funline void _mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A) { __builtin_ia32_storedqusi512_mask((int *)__P, (__v16si)__A, (__mmask16)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permutevar_pd(__m512d __A, __m512i __C) { +__funline __m512d _mm512_permutevar_pd(__m512d __A, __m512i __C) { return (__m512d)__builtin_ia32_vpermilvarpd512_mask( (__v8df)__A, (__v8di)__C, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) { +__funline __m512d _mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, + __m512d __A, __m512i __C) { return (__m512d)__builtin_ia32_vpermilvarpd512_mask( (__v8df)__A, (__v8di)__C, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) { +__funline __m512d _mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, + __m512i __C) { return (__m512d)__builtin_ia32_vpermilvarpd512_mask( (__v8df)__A, (__v8di)__C, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permutevar_ps(__m512 __A, __m512i __C) { +__funline __m512 _mm512_permutevar_ps(__m512 __A, __m512i __C) { return (__m512)__builtin_ia32_vpermilvarps512_mask( (__v16sf)__A, (__v16si)__C, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); } -extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) { +__funline __m512 _mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512i __C) { return (__m512)__builtin_ia32_vpermilvarps512_mask( (__v16sf)__A, (__v16si)__C, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) { +__funline __m512 _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, + __m512i __C) { return (__m512)__builtin_ia32_vpermilvarps512_mask( (__v16sf)__A, (__v16si)__C, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) { +__funline __m512i _mm512_permutex2var_epi64(__m512i __A, __m512i __I, + __m512i __B) { return (__m512i)__builtin_ia32_vpermt2varq512_mask((__v8di)__I /* idx */, (__v8di)__A, (__v8di)__B, (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I, - __m512i __B) { +__funline __m512i _mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, + __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermt2varq512_mask((__v8di)__I /* idx */, (__v8di)__A, (__v8di)__B, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, - __m512i __B) { +__funline __m512i _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, + __mmask8 __U, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varq512_mask((__v8di)__A, (__v8di)__I /* idx */, @@ -5207,39 +4311,32 @@ extern __inline __m512i (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, - __m512i __B) { +__funline __m512i _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, + __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermt2varq512_maskz((__v8di)__I /* idx */, (__v8di)__A, (__v8di)__B, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) { +__funline __m512i _mm512_permutex2var_epi32(__m512i __A, __m512i __I, + __m512i __B) { return (__m512i)__builtin_ia32_vpermt2vard512_mask((__v16si)__I /* idx */, (__v16si)__A, (__v16si)__B, (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I, - __m512i __B) { +__funline __m512i _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, + __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermt2vard512_mask((__v16si)__I /* idx */, (__v16si)__A, (__v16si)__B, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, - __m512i __B) { +__funline __m512i _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, + __mmask16 __U, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2vard512_mask((__v16si)__A, (__v16si)__I /* idx */, @@ -5247,39 +4344,31 @@ extern __inline __m512i (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, - __m512i __B) { +__funline __m512i _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, + __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermt2vard512_maskz( (__v16si)__I /* idx */, (__v16si)__A, (__v16si)__B, (__mmask16)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) { +__funline __m512d _mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) { return (__m512d)__builtin_ia32_vpermt2varpd512_mask((__v8di)__I /* idx */, (__v8df)__A, (__v8df)__B, (__mmask8)-1); 
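/*
 * Aside: permutex2var treats __A and __B as one 16-entry table; for the
 * epi64 form each index's low three bits pick the element and bit 3 picks
 * the source. Sketch, same assumptions as before:
 */
__m512i interleave_low_qwords(__m512i a, __m512i b) {
  /* indices 0..7 address a, 8..15 address b */
  const __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11);
  return _mm512_permutex2var_epi64(a, idx, b);
}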
} -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, - __m512d __B) { +__funline __m512d _mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, + __m512i __I, __m512d __B) { return (__m512d)__builtin_ia32_vpermt2varpd512_mask((__v8di)__I /* idx */, (__v8df)__A, (__v8df)__B, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, - __m512d __B) { +__funline __m512d _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, + __mmask8 __U, __m512d __B) { return (__m512d)__builtin_ia32_vpermi2varpd512_mask((__v8df)__A, (__v8di)__I /* idx */, @@ -5287,39 +4376,31 @@ extern __inline __m512d (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I, - __m512d __B) { +__funline __m512d _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, + __m512i __I, __m512d __B) { return (__m512d)__builtin_ia32_vpermt2varpd512_maskz((__v8di)__I /* idx */, (__v8df)__A, (__v8df)__B, (__mmask8)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) { +__funline __m512 _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) { return (__m512)__builtin_ia32_vpermt2varps512_mask((__v16si)__I /* idx */, (__v16sf)__A, (__v16sf)__B, (__mmask16)-1); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, - __m512 __B) { +__funline __m512 _mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, + __m512i __I, __m512 __B) { return (__m512)__builtin_ia32_vpermt2varps512_mask((__v16si)__I /* idx */, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, - __m512 __B) { +__funline __m512 _mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, + __mmask16 __U, __m512 __B) { return (__m512)__builtin_ia32_vpermi2varps512_mask((__v16sf)__A, (__v16si)__I /* idx */, @@ -5327,10 +4408,8 @@ extern __inline __m512 (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, - __m512 __B) { +__funline __m512 _mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, + __m512i __I, __m512 __B) { return (__m512)__builtin_ia32_vpermt2varps512_maskz( (__v16si)__I /* idx */, @@ -5338,44 +4417,36 @@ extern __inline __m512 } #ifdef __OPTIMIZE__ -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permute_pd(__m512d __X, const int __C) { +__funline __m512d _mm512_permute_pd(__m512d __X, const int __C) { return (__m512d)__builtin_ia32_vpermilpd512_mask( (__v8df)__X, __C, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_permute_pd(__m512d __W, __mmask8 __U, __m512d __X, const int __C) { +__funline __m512d _mm512_mask_permute_pd(__m512d __W, __mmask8 __U, __m512d __X, + const int __C) { return (__m512d)__builtin_ia32_vpermilpd512_mask((__v8df)__X, __C, 
(__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permute_pd(__mmask8 __U, __m512d __X, const int __C) { +__funline __m512d _mm512_maskz_permute_pd(__mmask8 __U, __m512d __X, + const int __C) { return (__m512d)__builtin_ia32_vpermilpd512_mask( (__v8df)__X, __C, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permute_ps(__m512 __X, const int __C) { +__funline __m512 _mm512_permute_ps(__m512 __X, const int __C) { return (__m512)__builtin_ia32_vpermilps512_mask( (__v16sf)__X, __C, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); } -extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_permute_ps(__m512 __W, __mmask16 __U, __m512 __X, const int __C) { +__funline __m512 _mm512_mask_permute_ps(__m512 __W, __mmask16 __U, __m512 __X, + const int __C) { return (__m512)__builtin_ia32_vpermilps512_mask((__v16sf)__X, __C, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permute_ps(__mmask16 __U, __m512 __X, const int __C) { +__funline __m512 _mm512_maskz_permute_ps(__mmask16 __U, __m512 __X, + const int __C) { return (__m512)__builtin_ia32_vpermilps512_mask( (__v16sf)__X, __C, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } @@ -5410,45 +4481,36 @@ extern __inline __m512 #endif #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permutex_epi64(__m512i __X, const int __I) { +__funline __m512i _mm512_permutex_epi64(__m512i __X, const int __I) { return (__m512i)__builtin_ia32_permdi512_mask( (__v8di)__X, __I, (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1)); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_permutex_epi64(__m512i __W, __mmask8 __M, __m512i __X, - const int __I) { +__funline __m512i _mm512_mask_permutex_epi64(__m512i __W, __mmask8 __M, + __m512i __X, const int __I) { return (__m512i)__builtin_ia32_permdi512_mask((__v8di)__X, __I, (__v8di)__W, (__mmask8)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutex_epi64(__mmask8 __M, __m512i __X, const int __I) { +__funline __m512i _mm512_maskz_permutex_epi64(__mmask8 __M, __m512i __X, + const int __I) { return (__m512i)__builtin_ia32_permdi512_mask( (__v8di)__X, __I, (__v8di)_mm512_setzero_si512(), (__mmask8)__M); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permutex_pd(__m512d __X, const int __M) { +__funline __m512d _mm512_permutex_pd(__m512d __X, const int __M) { return (__m512d)__builtin_ia32_permdf512_mask( (__v8df)__X, __M, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_permutex_pd(__m512d __W, __mmask8 __U, __m512d __X, const int __M) { +__funline __m512d _mm512_mask_permutex_pd(__m512d __W, __mmask8 __U, __m512d __X, + const int __M) { return (__m512d)__builtin_ia32_permdf512_mask((__v8df)__X, __M, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutex_pd(__mmask8 __U, __m512d __X, const int __M) { +__funline __m512d _mm512_maskz_permutex_pd(__mmask8 __U, 
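/*
 * Aside: permutex shuffles 64-bit lanes with an imm8 applied independently
 * to each 256-bit half, using the familiar _MM_SHUFFLE encoding. Sketch,
 * same assumptions as before:
 */
__m512i reverse_quads_per_half(__m512i v) {
  return _mm512_permutex_epi64(v, _MM_SHUFFLE(0, 1, 2, 3));
}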
__m512d __X, + const int __M) { return (__m512d)__builtin_ia32_permdf512_mask( (__v8df)__X, __M, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } @@ -5482,240 +4544,191 @@ extern __inline __m512d (__v8di)(__m512i)(X), (int)(I), (__v8di)(__m512i)(W), (__mmask8)(M))) #endif -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutexvar_epi64(__mmask8 __M, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_maskz_permutexvar_epi64(__mmask8 __M, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_permvardi512_mask( (__v8di)__Y, (__v8di)__X, (__v8di)_mm512_setzero_si512(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permutexvar_epi64(__m512i __X, __m512i __Y) { +__funline __m512i _mm512_permutexvar_epi64(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_permvardi512_mask( (__v8di)__Y, (__v8di)__X, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_permutexvar_epi64(__m512i __W, __mmask8 __M, __m512i __X, - __m512i __Y) { +__funline __m512i _mm512_mask_permutexvar_epi64(__m512i __W, __mmask8 __M, + __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_permvardi512_mask((__v8di)__Y, (__v8di)__X, (__v8di)__W, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutexvar_epi32(__mmask16 __M, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_maskz_permutexvar_epi32(__mmask16 __M, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_permvarsi512_mask( (__v16si)__Y, (__v16si)__X, (__v16si)_mm512_setzero_si512(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permutexvar_epi32(__m512i __X, __m512i __Y) { +__funline __m512i _mm512_permutexvar_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_permvarsi512_mask( (__v16si)__Y, (__v16si)__X, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_permutexvar_epi32(__m512i __W, __mmask16 __M, __m512i __X, - __m512i __Y) { +__funline __m512i _mm512_mask_permutexvar_epi32(__m512i __W, __mmask16 __M, + __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_permvarsi512_mask((__v16si)__Y, (__v16si)__X, (__v16si)__W, __M); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permutexvar_pd(__m512i __X, __m512d __Y) { +__funline __m512d _mm512_permutexvar_pd(__m512i __X, __m512d __Y) { return (__m512d)__builtin_ia32_permvardf512_mask( (__v8df)__Y, (__v8di)__X, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_permutexvar_pd(__m512d __W, __mmask8 __U, __m512i __X, - __m512d __Y) { +__funline __m512d _mm512_mask_permutexvar_pd(__m512d __W, __mmask8 __U, + __m512i __X, __m512d __Y) { return (__m512d)__builtin_ia32_permvardf512_mask((__v8df)__Y, (__v8di)__X, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutexvar_pd(__mmask8 __U, __m512i __X, __m512d __Y) { +__funline __m512d _mm512_maskz_permutexvar_pd(__mmask8 __U, __m512i __X, + __m512d __Y) { return (__m512d)__builtin_ia32_permvardf512_mask( 
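/*
 * Aside: permutexvar is the full-width variable permute; note that the
 * index vector is the first argument. Sketch, same assumptions as before:
 */
__m512i reverse_dwords(__m512i v) {
  const __m512i idx = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8,
                                        7, 6, 5, 4, 3, 2, 1, 0);
  return _mm512_permutexvar_epi32(idx, v);
}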
(__v8df)__Y, (__v8di)__X, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permutexvar_ps(__m512i __X, __m512 __Y) { +__funline __m512 _mm512_permutexvar_ps(__m512i __X, __m512 __Y) { return (__m512)__builtin_ia32_permvarsf512_mask( (__v16sf)__Y, (__v16si)__X, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); } -extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_permutexvar_ps(__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) { +__funline __m512 _mm512_mask_permutexvar_ps(__m512 __W, __mmask16 __U, + __m512i __X, __m512 __Y) { return (__m512)__builtin_ia32_permvarsf512_mask((__v16sf)__Y, (__v16si)__X, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutexvar_ps(__mmask16 __U, __m512i __X, __m512 __Y) { +__funline __m512 _mm512_maskz_permutexvar_ps(__mmask16 __U, __m512i __X, + __m512 __Y) { return (__m512)__builtin_ia32_permvarsf512_mask( (__v16sf)__Y, (__v16si)__X, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } #ifdef __OPTIMIZE__ -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shuffle_ps(__m512 __M, __m512 __V, const int __imm) { +__funline __m512 _mm512_shuffle_ps(__m512 __M, __m512 __V, const int __imm) { return (__m512)__builtin_ia32_shufps512_mask( (__v16sf)__M, (__v16sf)__V, __imm, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_shuffle_ps(__m512 __W, __mmask16 __U, __m512 __M, __m512 __V, - const int __imm) { +__funline __m512 _mm512_mask_shuffle_ps(__m512 __W, __mmask16 __U, __m512 __M, + __m512 __V, const int __imm) { return (__m512)__builtin_ia32_shufps512_mask( (__v16sf)__M, (__v16sf)__V, __imm, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_shuffle_ps(__mmask16 __U, __m512 __M, __m512 __V, - const int __imm) { +__funline __m512 _mm512_maskz_shuffle_ps(__mmask16 __U, __m512 __M, __m512 __V, + const int __imm) { return (__m512)__builtin_ia32_shufps512_mask( (__v16sf)__M, (__v16sf)__V, __imm, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shuffle_pd(__m512d __M, __m512d __V, const int __imm) { +__funline __m512d _mm512_shuffle_pd(__m512d __M, __m512d __V, const int __imm) { return (__m512d)__builtin_ia32_shufpd512_mask((__v8df)__M, (__v8df)__V, __imm, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_shuffle_pd(__m512d __W, __mmask8 __U, __m512d __M, __m512d __V, - const int __imm) { +__funline __m512d _mm512_mask_shuffle_pd(__m512d __W, __mmask8 __U, __m512d __M, + __m512d __V, const int __imm) { return (__m512d)__builtin_ia32_shufpd512_mask((__v8df)__M, (__v8df)__V, __imm, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_shuffle_pd(__mmask8 __U, __m512d __M, __m512d __V, - const int __imm) { +__funline __m512d _mm512_maskz_shuffle_pd(__mmask8 __U, __m512d __M, __m512d __V, + const int __imm) { return (__m512d)__builtin_ia32_shufpd512_mask((__v8df)__M, (__v8df)__V, __imm, 
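/*
 * Aside: _mm512_shuffle_ps applies the classic shufps selector
 * independently in each 128-bit lane. Sketch, same assumptions as before:
 */
__m512 dup_even_floats(__m512 a) {
  /* per lane: a0,a0,a2,a2 (equivalent to _mm512_moveldup_ps) */
  return _mm512_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 0, 0));
}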
(__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_fixupimm_round_pd(__m512d __A, __m512d __B, __m512i __C, - const int __imm, const int __R) { +__funline __m512d _mm512_fixupimm_round_pd(__m512d __A, __m512d __B, __m512i __C, + const int __imm, const int __R) { return (__m512d)__builtin_ia32_fixupimmpd512_mask( (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fixupimm_round_pd(__m512d __A, __mmask8 __U, __m512d __B, - __m512i __C, const int __imm, const int __R) { +__funline __m512d _mm512_mask_fixupimm_round_pd(__m512d __A, __mmask8 __U, + __m512d __B, __m512i __C, + const int __imm, const int __R) { return (__m512d)__builtin_ia32_fixupimmpd512_mask( (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fixupimm_round_pd(__mmask8 __U, __m512d __A, __m512d __B, - __m512i __C, const int __imm, - const int __R) { +__funline __m512d _mm512_maskz_fixupimm_round_pd(__mmask8 __U, __m512d __A, + __m512d __B, __m512i __C, + const int __imm, const int __R) { return (__m512d)__builtin_ia32_fixupimmpd512_maskz( (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_fixupimm_round_ps(__m512 __A, __m512 __B, __m512i __C, - const int __imm, const int __R) { +__funline __m512 _mm512_fixupimm_round_ps(__m512 __A, __m512 __B, __m512i __C, + const int __imm, const int __R) { return (__m512)__builtin_ia32_fixupimmps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fixupimm_round_ps(__m512 __A, __mmask16 __U, __m512 __B, - __m512i __C, const int __imm, const int __R) { +__funline __m512 _mm512_mask_fixupimm_round_ps(__m512 __A, __mmask16 __U, + __m512 __B, __m512i __C, + const int __imm, const int __R) { return (__m512)__builtin_ia32_fixupimmps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fixupimm_round_ps(__mmask16 __U, __m512 __A, __m512 __B, - __m512i __C, const int __imm, - const int __R) { +__funline __m512 _mm512_maskz_fixupimm_round_ps(__mmask16 __U, __m512 __A, + __m512 __B, __m512i __C, + const int __imm, const int __R) { return (__m512)__builtin_ia32_fixupimmps512_maskz( (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fixupimm_round_sd(__m128d __A, __m128d __B, __m128i __C, - const int __imm, const int __R) { +__funline __m128d _mm_fixupimm_round_sd(__m128d __A, __m128d __B, __m128i __C, + const int __imm, const int __R) { return (__m128d)__builtin_ia32_fixupimmsd_mask( (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)-1, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fixupimm_round_sd(__m128d __A, __mmask8 __U, __m128d __B, - __m128i __C, const int __imm, const int __R) { +__funline __m128d _mm_mask_fixupimm_round_sd(__m128d __A, __mmask8 __U, + __m128d __B, __m128i 
__C, + const int __imm, const int __R) { return (__m128d)__builtin_ia32_fixupimmsd_mask( (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fixupimm_round_sd(__mmask8 __U, __m128d __A, __m128d __B, - __m128i __C, const int __imm, const int __R) { +__funline __m128d _mm_maskz_fixupimm_round_sd(__mmask8 __U, __m128d __A, + __m128d __B, __m128i __C, + const int __imm, const int __R) { return (__m128d)__builtin_ia32_fixupimmsd_maskz( (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fixupimm_round_ss(__m128 __A, __m128 __B, __m128i __C, const int __imm, - const int __R) { +__funline __m128 _mm_fixupimm_round_ss(__m128 __A, __m128 __B, __m128i __C, + const int __imm, const int __R) { return (__m128)__builtin_ia32_fixupimmss_mask( (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)-1, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fixupimm_round_ss(__m128 __A, __mmask8 __U, __m128 __B, - __m128i __C, const int __imm, const int __R) { +__funline __m128 _mm_mask_fixupimm_round_ss(__m128 __A, __mmask8 __U, __m128 __B, + __m128i __C, const int __imm, + const int __R) { return (__m128)__builtin_ia32_fixupimmss_mask( (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fixupimm_round_ss(__mmask8 __U, __m128 __A, __m128 __B, - __m128i __C, const int __imm, const int __R) { +__funline __m128 _mm_maskz_fixupimm_round_ss(__mmask8 __U, __m128 __A, __m128 __B, + __m128i __C, const int __imm, + const int __R) { return (__m128)__builtin_ia32_fixupimmss_maskz( (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U, __R); } @@ -5812,223 +4825,170 @@ extern __inline __m128 (int)(C), (__mmask8)(U), (R))) #endif -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_movehdup_ps(__m512 __A) { +__funline __m512 _mm512_movehdup_ps(__m512 __A) { return (__m512)__builtin_ia32_movshdup512_mask( (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_movehdup_ps(__m512 __W, __mmask16 __U, __m512 __A) { +__funline __m512 _mm512_mask_movehdup_ps(__m512 __W, __mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_movshdup512_mask((__v16sf)__A, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_movehdup_ps(__mmask16 __U, __m512 __A) { +__funline __m512 _mm512_maskz_movehdup_ps(__mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_movshdup512_mask( (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_moveldup_ps(__m512 __A) { +__funline __m512 _mm512_moveldup_ps(__m512 __A) { return (__m512)__builtin_ia32_movsldup512_mask( (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_moveldup_ps(__m512 __W, __mmask16 __U, __m512 __A) { +__funline __m512 _mm512_mask_moveldup_ps(__m512 __W, __mmask16 __U, __m512 __A) { return 
(__m512)__builtin_ia32_movsldup512_mask((__v16sf)__A, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_moveldup_ps(__mmask16 __U, __m512 __A) { +__funline __m512 _mm512_maskz_moveldup_ps(__mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_movsldup512_mask( (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_or_si512(__m512i __A, __m512i __B) { +__funline __m512i _mm512_or_si512(__m512i __A, __m512i __B) { return (__m512i)((__v16su)__A | (__v16su)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_or_epi32(__m512i __A, __m512i __B) { +__funline __m512i _mm512_or_epi32(__m512i __A, __m512i __B) { return (__m512i)((__v16su)__A | (__v16su)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_or_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_or_epi32(__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pord512_mask((__v16si)__A, (__v16si)__B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_or_epi32(__mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_or_epi32(__mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pord512_mask((__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_or_epi64(__m512i __A, __m512i __B) { +__funline __m512i _mm512_or_epi64(__m512i __A, __m512i __B) { return (__m512i)((__v8du)__A | (__v8du)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_or_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_or_epi64(__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_porq512_mask((__v8di)__A, (__v8di)__B, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_or_epi64(__mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_or_epi64(__mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_porq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_xor_si512(__m512i __A, __m512i __B) { +__funline __m512i _mm512_xor_si512(__m512i __A, __m512i __B) { return (__m512i)((__v16su)__A ^ (__v16su)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_xor_epi32(__m512i __A, __m512i __B) { +__funline __m512i _mm512_xor_epi32(__m512i __A, __m512i __B) { return (__m512i)((__v16su)__A ^ (__v16su)__B); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_xor_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_xor_epi32(__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pxord512_mask((__v16si)__A, (__v16si)__B, (__v16si)__W, (__mmask16)__U); } -extern __inline 
__m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_xor_epi32(__mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_xor_epi32(__mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pxord512_mask((__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_xor_epi64(__m512i __A, __m512i __B) { +__funline __m512i _mm512_xor_epi64(__m512i __A, __m512i __B) { return (__m512i)((__v8du)__A ^ (__v8du)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_xor_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_xor_epi64(__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pxorq512_mask((__v8di)__A, (__v8di)__B, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_xor_epi64(__mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_xor_epi64(__mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pxorq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_rol_epi32(__m512i __A, const int __B) { +__funline __m512i _mm512_rol_epi32(__m512i __A, const int __B) { return (__m512i)__builtin_ia32_prold512_mask( (__v16si)__A, __B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_rol_epi32(__m512i __W, __mmask16 __U, __m512i __A, const int __B) { +__funline __m512i _mm512_mask_rol_epi32(__m512i __W, __mmask16 __U, __m512i __A, + const int __B) { return (__m512i)__builtin_ia32_prold512_mask((__v16si)__A, __B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_rol_epi32(__mmask16 __U, __m512i __A, const int __B) { +__funline __m512i _mm512_maskz_rol_epi32(__mmask16 __U, __m512i __A, + const int __B) { return (__m512i)__builtin_ia32_prold512_mask( (__v16si)__A, __B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_ror_epi32(__m512i __A, int __B) { +__funline __m512i _mm512_ror_epi32(__m512i __A, int __B) { return (__m512i)__builtin_ia32_prord512_mask( (__v16si)__A, __B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_ror_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) { +__funline __m512i _mm512_mask_ror_epi32(__m512i __W, __mmask16 __U, __m512i __A, + int __B) { return (__m512i)__builtin_ia32_prord512_mask((__v16si)__A, __B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_ror_epi32(__mmask16 __U, __m512i __A, int __B) { +__funline __m512i _mm512_maskz_ror_epi32(__mmask16 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_prord512_mask( (__v16si)__A, __B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, 
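/*
 * Aside: rol/ror rotate every lane by a constant count; like the other
 * immediate forms, the function bodies exist only under __OPTIMIZE__.
 * Sketch, same assumptions as before:
 */
__m512i rotl7(__m512i v) {
  return _mm512_rol_epi32(v, 7); /* each 32-bit lane rotated left by 7 */
}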
__artificial__)) - _mm512_rol_epi64(__m512i __A, const int __B) { +__funline __m512i _mm512_rol_epi64(__m512i __A, const int __B) { return (__m512i)__builtin_ia32_prolq512_mask( (__v8di)__A, __B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_rol_epi64(__m512i __W, __mmask8 __U, __m512i __A, const int __B) { +__funline __m512i _mm512_mask_rol_epi64(__m512i __W, __mmask8 __U, __m512i __A, + const int __B) { return (__m512i)__builtin_ia32_prolq512_mask((__v8di)__A, __B, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_rol_epi64(__mmask8 __U, __m512i __A, const int __B) { +__funline __m512i _mm512_maskz_rol_epi64(__mmask8 __U, __m512i __A, + const int __B) { return (__m512i)__builtin_ia32_prolq512_mask( (__v8di)__A, __B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_ror_epi64(__m512i __A, int __B) { +__funline __m512i _mm512_ror_epi64(__m512i __A, int __B) { return (__m512i)__builtin_ia32_prorq512_mask( (__v8di)__A, __B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_ror_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) { +__funline __m512i _mm512_mask_ror_epi64(__m512i __W, __mmask8 __U, __m512i __A, + int __B) { return (__m512i)__builtin_ia32_prorq512_mask((__v8di)__A, __B, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_ror_epi64(__mmask8 __U, __m512i __A, int __B) { +__funline __m512i _mm512_maskz_ror_epi64(__mmask8 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_prorq512_mask( (__v8di)__A, __B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } @@ -6081,311 +5041,238 @@ extern __inline __m512i (__mmask8)(U))) #endif -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_and_si512(__m512i __A, __m512i __B) { +__funline __m512i _mm512_and_si512(__m512i __A, __m512i __B) { return (__m512i)((__v16su)__A & (__v16su)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_and_epi32(__m512i __A, __m512i __B) { +__funline __m512i _mm512_and_epi32(__m512i __A, __m512i __B) { return (__m512i)((__v16su)__A & (__v16su)__B); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_and_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_and_epi32(__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pandd512_mask((__v16si)__A, (__v16si)__B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_and_epi32(__mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_and_epi32(__mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pandd512_mask((__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_and_epi64(__m512i __A, __m512i __B) { +__funline __m512i _mm512_and_epi64(__m512i __A, __m512i __B) { return 
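/* Aside: a sketch of why the rotate forms above sit under #ifdef
   __OPTIMIZE__: the count maps to an imm8, so the call must fold to a
   constant. rol32_demo is a hypothetical helper, assuming -mavx512f with
   optimization enabled. */
static __m512i rol32_demo(__m512i v) {
  return _mm512_rol_epi32(v, 7);  /* rotate each 32-bit lane left by 7 */
}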
(__m512i)((__v8du)__A & (__v8du)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_and_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_and_epi64(__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pandq512_mask((__v8di)__A, (__v8di)__B, (__v8di)__W, __U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_and_epi64(__mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_and_epi64(__mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pandq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_pd(), __U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_andnot_si512(__m512i __A, __m512i __B) { +__funline __m512i _mm512_andnot_si512(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pandnd512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_andnot_epi32(__m512i __A, __m512i __B) { +__funline __m512i _mm512_andnot_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pandnd512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, + __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pandnd512_mask((__v16si)__A, (__v16si)__B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pandnd512_mask((__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_andnot_epi64(__m512i __A, __m512i __B) { +__funline __m512i _mm512_andnot_epi64(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pandnq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pandnq512_mask((__v8di)__A, (__v8di)__B, (__v8di)__W, __U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pandnq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_pd(), __U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_test_epi32_mask(__m512i __A, __m512i __B) { +__funline __mmask16 _mm512_test_epi32_mask(__m512i __A, __m512i __B) { return (__mmask16)__builtin_ia32_ptestmd512((__v16si)__A, 
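/* Aside: a sketch of the ptestm semantics being wrapped here: bit i of the
   result mask is set iff (__A[i] & __B[i]) != 0, a per-lane nonzero-AND
   test. nonzero_and_demo is a hypothetical helper, assuming AVX-512F. */
static __mmask16 nonzero_and_demo(__m512i a, __m512i b) {
  return _mm512_test_epi32_mask(a, b);
}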
(__v16si)__B, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_test_epi32_mask(__mmask16 __U, __m512i __A, __m512i __B) { +__funline __mmask16 _mm512_mask_test_epi32_mask(__mmask16 __U, __m512i __A, + __m512i __B) { return (__mmask16)__builtin_ia32_ptestmd512((__v16si)__A, (__v16si)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_test_epi64_mask(__m512i __A, __m512i __B) { +__funline __mmask8 _mm512_test_epi64_mask(__m512i __A, __m512i __B) { return (__mmask8)__builtin_ia32_ptestmq512((__v8di)__A, (__v8di)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_test_epi64_mask(__mmask8 __U, __m512i __A, __m512i __B) { +__funline __mmask8 _mm512_mask_test_epi64_mask(__mmask8 __U, __m512i __A, + __m512i __B) { return (__mmask8)__builtin_ia32_ptestmq512((__v8di)__A, (__v8di)__B, __U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_testn_epi32_mask(__m512i __A, __m512i __B) { +__funline __mmask16 _mm512_testn_epi32_mask(__m512i __A, __m512i __B) { return (__mmask16)__builtin_ia32_ptestnmd512((__v16si)__A, (__v16si)__B, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_testn_epi32_mask(__mmask16 __U, __m512i __A, __m512i __B) { +__funline __mmask16 _mm512_mask_testn_epi32_mask(__mmask16 __U, __m512i __A, + __m512i __B) { return (__mmask16)__builtin_ia32_ptestnmd512((__v16si)__A, (__v16si)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_testn_epi64_mask(__m512i __A, __m512i __B) { +__funline __mmask8 _mm512_testn_epi64_mask(__m512i __A, __m512i __B) { return (__mmask8)__builtin_ia32_ptestnmq512((__v8di)__A, (__v8di)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_testn_epi64_mask(__mmask8 __U, __m512i __A, __m512i __B) { +__funline __mmask8 _mm512_mask_testn_epi64_mask(__mmask8 __U, __m512i __A, + __m512i __B) { return (__mmask8)__builtin_ia32_ptestnmq512((__v8di)__A, (__v8di)__B, __U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_abs_ps(__m512 __A) { +__funline __m512 _mm512_abs_ps(__m512 __A) { return (__m512)_mm512_and_epi32((__m512i)__A, _mm512_set1_epi32(0x7fffffff)); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_abs_ps(__m512 __W, __mmask16 __U, __m512 __A) { +__funline __m512 _mm512_mask_abs_ps(__m512 __W, __mmask16 __U, __m512 __A) { return (__m512)_mm512_mask_and_epi32((__m512i)__W, __U, (__m512i)__A, _mm512_set1_epi32(0x7fffffff)); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_abs_pd(__m512d __A) { +__funline __m512d _mm512_abs_pd(__m512d __A) { return (__m512d)_mm512_and_epi64((__m512i)__A, _mm512_set1_epi64(0x7fffffffffffffffLL)); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_abs_pd(__m512d __W, __mmask8 __U, __m512d __A) { +__funline __m512d _mm512_mask_abs_pd(__m512d __W, __mmask8 __U, __m512d __A) { return (__m512d)_mm512_mask_and_epi64( (__m512i)__W, __U, (__m512i)__A, _mm512_set1_epi64(0x7fffffffffffffffLL)); } -extern __inline __m512i - 
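/* Aside: _mm512_abs_ps above implements |x| as a bitwise AND that clears
   the IEEE-754 sign bit (0x7fffffff per lane). A scalar model of the same
   trick, as a hypothetical helper in standard C: */
#include <stdint.h>
#include <string.h>
static float fabs_bits_demo(float x) {
  uint32_t u;
  memcpy(&u, &x, sizeof u);  /* defined-behavior type pun */
  u &= 0x7fffffffu;          /* clear the sign bit */
  memcpy(&x, &u, sizeof x);
  return x;
}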
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_unpackhi_epi32(__m512i __A, __m512i __B) { +__funline __m512i _mm512_unpackhi_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_punpckhdq512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, - __m512i __B) { +__funline __m512i _mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, + __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_punpckhdq512_mask( (__v16si)__A, (__v16si)__B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_punpckhdq512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_unpackhi_epi64(__m512i __A, __m512i __B) { +__funline __m512i _mm512_unpackhi_epi64(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_punpckhqdq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, - __m512i __B) { +__funline __m512i _mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, + __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_punpckhqdq512_mask((__v8di)__A, (__v8di)__B, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_punpckhqdq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_unpacklo_epi32(__m512i __A, __m512i __B) { +__funline __m512i _mm512_unpacklo_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_punpckldq512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, - __m512i __B) { +__funline __m512i _mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, + __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_punpckldq512_mask( (__v16si)__A, (__v16si)__B, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_punpckldq512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_unpacklo_epi64(__m512i __A, __m512i __B) { +__funline __m512i _mm512_unpacklo_epi64(__m512i __A, 
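/* Aside: the 512-bit unpacks above operate per 128-bit lane, i.e. SSE2
   punpckldq replicated four times: within each lane, {a0,a1,a2,a3} and
   {b0,b1,b2,b3} interleave to {a0,b0,a1,b1}. Hypothetical helper, assuming
   AVX-512F: */
static __m512i interleave_lo_demo(__m512i a, __m512i b) {
  return _mm512_unpacklo_epi32(a, b);
}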
__m512i __B) { return (__m512i)__builtin_ia32_punpcklqdq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_unpacklo_epi64(__m512i __W, __mmask8 __U, __m512i __A, - __m512i __B) { +__funline __m512i _mm512_mask_unpacklo_epi64(__m512i __W, __mmask8 __U, + __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_punpcklqdq512_mask((__v8di)__A, (__v8di)__B, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_unpacklo_epi64(__mmask8 __U, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_unpacklo_epi64(__mmask8 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_punpcklqdq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } #ifdef __x86_64__ #ifdef __OPTIMIZE__ -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundss_u64(__m128 __A, const int __R) { +__funline unsigned long long _mm_cvt_roundss_u64(__m128 __A, const int __R) { return (unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)__A, __R); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundss_si64(__m128 __A, const int __R) { +__funline long long _mm_cvt_roundss_si64(__m128 __A, const int __R) { return (long long)__builtin_ia32_vcvtss2si64((__v4sf)__A, __R); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundss_i64(__m128 __A, const int __R) { +__funline long long _mm_cvt_roundss_i64(__m128 __A, const int __R) { return (long long)__builtin_ia32_vcvtss2si64((__v4sf)__A, __R); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtt_roundss_u64(__m128 __A, const int __R) { +__funline unsigned long long _mm_cvtt_roundss_u64(__m128 __A, const int __R) { return (unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)__A, __R); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtt_roundss_i64(__m128 __A, const int __R) { +__funline long long _mm_cvtt_roundss_i64(__m128 __A, const int __R) { return (long long)__builtin_ia32_vcvttss2si64((__v4sf)__A, __R); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtt_roundss_si64(__m128 __A, const int __R) { +__funline long long _mm_cvtt_roundss_si64(__m128 __A, const int __R) { return (long long)__builtin_ia32_vcvttss2si64((__v4sf)__A, __R); } #else @@ -6408,39 +5295,27 @@ extern __inline long long #endif #ifdef __OPTIMIZE__ -extern __inline unsigned - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundss_u32(__m128 __A, const int __R) { +__funline unsigned _mm_cvt_roundss_u32(__m128 __A, const int __R) { return (unsigned)__builtin_ia32_vcvtss2usi32((__v4sf)__A, __R); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundss_si32(__m128 __A, const int __R) { +__funline int _mm_cvt_roundss_si32(__m128 __A, const int __R) { return (int)__builtin_ia32_vcvtss2si32((__v4sf)__A, __R); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundss_i32(__m128 __A, const int __R) { +__funline int _mm_cvt_roundss_i32(__m128 __A, const int 
__R) { return (int)__builtin_ia32_vcvtss2si32((__v4sf)__A, __R); } -extern __inline unsigned - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtt_roundss_u32(__m128 __A, const int __R) { +__funline unsigned _mm_cvtt_roundss_u32(__m128 __A, const int __R) { return (unsigned)__builtin_ia32_vcvttss2usi32((__v4sf)__A, __R); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtt_roundss_i32(__m128 __A, const int __R) { +__funline int _mm_cvtt_roundss_i32(__m128 __A, const int __R) { return (int)__builtin_ia32_vcvttss2si32((__v4sf)__A, __R); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtt_roundss_si32(__m128 __A, const int __R) { +__funline int _mm_cvtt_roundss_si32(__m128 __A, const int __R) { return (int)__builtin_ia32_vcvttss2si32((__v4sf)__A, __R); } #else @@ -6460,39 +5335,27 @@ extern __inline int #ifdef __x86_64__ #ifdef __OPTIMIZE__ -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundsd_u64(__m128d __A, const int __R) { +__funline unsigned long long _mm_cvt_roundsd_u64(__m128d __A, const int __R) { return (unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)__A, __R); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundsd_si64(__m128d __A, const int __R) { +__funline long long _mm_cvt_roundsd_si64(__m128d __A, const int __R) { return (long long)__builtin_ia32_vcvtsd2si64((__v2df)__A, __R); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundsd_i64(__m128d __A, const int __R) { +__funline long long _mm_cvt_roundsd_i64(__m128d __A, const int __R) { return (long long)__builtin_ia32_vcvtsd2si64((__v2df)__A, __R); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtt_roundsd_u64(__m128d __A, const int __R) { +__funline unsigned long long _mm_cvtt_roundsd_u64(__m128d __A, const int __R) { return (unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)__A, __R); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtt_roundsd_si64(__m128d __A, const int __R) { +__funline long long _mm_cvtt_roundsd_si64(__m128d __A, const int __R) { return (long long)__builtin_ia32_vcvttsd2si64((__v2df)__A, __R); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtt_roundsd_i64(__m128d __A, const int __R) { +__funline long long _mm_cvtt_roundsd_i64(__m128d __A, const int __R) { return (long long)__builtin_ia32_vcvttsd2si64((__v2df)__A, __R); } #else @@ -6515,39 +5378,27 @@ extern __inline long long #endif #ifdef __OPTIMIZE__ -extern __inline unsigned - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundsd_u32(__m128d __A, const int __R) { +__funline unsigned _mm_cvt_roundsd_u32(__m128d __A, const int __R) { return (unsigned)__builtin_ia32_vcvtsd2usi32((__v2df)__A, __R); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundsd_si32(__m128d __A, const int __R) { +__funline int _mm_cvt_roundsd_si32(__m128d __A, const int __R) { return (int)__builtin_ia32_vcvtsd2si32((__v2df)__A, __R); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundsd_i32(__m128d __A, const int __R) { +__funline int 
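/* Aside: the _mm_cvt_roundss_* forms above take the rounding mode as an
   immediate instead of reading MXCSR. A hypothetical use, assuming
   __OPTIMIZE__ so the mode folds to a constant: */
static int cvt_nearest_demo(__m128 a) {
  return _mm_cvt_roundss_si32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}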
_mm_cvt_roundsd_i32(__m128d __A, const int __R) { return (int)__builtin_ia32_vcvtsd2si32((__v2df)__A, __R); } -extern __inline unsigned - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtt_roundsd_u32(__m128d __A, const int __R) { +__funline unsigned _mm_cvtt_roundsd_u32(__m128d __A, const int __R) { return (unsigned)__builtin_ia32_vcvttsd2usi32((__v2df)__A, __R); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtt_roundsd_i32(__m128d __A, const int __R) { +__funline int _mm_cvtt_roundsd_i32(__m128d __A, const int __R) { return (int)__builtin_ia32_vcvttsd2si32((__v2df)__A, __R); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtt_roundsd_si32(__m128d __A, const int __R) { +__funline int _mm_cvtt_roundsd_si32(__m128d __A, const int __R) { return (int)__builtin_ia32_vcvttsd2si32((__v2df)__A, __R); } #else @@ -6565,175 +5416,137 @@ extern __inline int #define _mm_cvtt_roundsd_i32(A, B) ((int)__builtin_ia32_vcvttsd2si32(A, B)) #endif -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_movedup_pd(__m512d __A) { +__funline __m512d _mm512_movedup_pd(__m512d __A) { return (__m512d)__builtin_ia32_movddup512_mask( (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_movedup_pd(__m512d __W, __mmask8 __U, __m512d __A) { +__funline __m512d _mm512_mask_movedup_pd(__m512d __W, __mmask8 __U, __m512d __A) { return (__m512d)__builtin_ia32_movddup512_mask((__v8df)__A, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_movedup_pd(__mmask8 __U, __m512d __A) { +__funline __m512d _mm512_maskz_movedup_pd(__mmask8 __U, __m512d __A) { return (__m512d)__builtin_ia32_movddup512_mask( (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_unpacklo_pd(__m512d __A, __m512d __B) { +__funline __m512d _mm512_unpacklo_pd(__m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_unpcklpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B) { return (__m512d)__builtin_ia32_unpcklpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_unpacklo_pd(__mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_maskz_unpacklo_pd(__mmask8 __U, __m512d __A, + __m512d __B) { return (__m512d)__builtin_ia32_unpcklpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_unpackhi_pd(__m512d __A, __m512d __B) { +__funline __m512d _mm512_unpackhi_pd(__m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_unpckhpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1); } -extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) 
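/* Aside: _mm512_movedup_pd above duplicates each even-indexed double over
   its odd neighbour: {a0,a1,a2,a3,...} -> {a0,a0,a2,a2,...}. dup_even_demo
   is a hypothetical helper, assuming AVX-512F: */
static __m512d dup_even_demo(__m512d a) {
  return _mm512_movedup_pd(a);
}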
-_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B) { return (__m512d)__builtin_ia32_unpckhpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, + __m512d __B) { return (__m512d)__builtin_ia32_unpckhpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_unpackhi_ps(__m512 __A, __m512 __B) { +__funline __m512 _mm512_unpackhi_ps(__m512 __A, __m512 __B) { return (__m512)__builtin_ia32_unpckhps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B) { return (__m512)__builtin_ia32_unpckhps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_unpackhi_ps(__mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_maskz_unpackhi_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_unpckhps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } #ifdef __OPTIMIZE__ -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundps_pd(__m256 __A, const int __R) { +__funline __m512d _mm512_cvt_roundps_pd(__m256 __A, const int __R) { return (__m512d)__builtin_ia32_cvtps2pd512_mask( (__v8sf)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundps_pd(__m512d __W, __mmask8 __U, __m256 __A, - const int __R) { +__funline __m512d _mm512_mask_cvt_roundps_pd(__m512d __W, __mmask8 __U, + __m256 __A, const int __R) { return (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)__A, (__v8df)__W, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundps_pd(__mmask8 __U, __m256 __A, const int __R) { +__funline __m512d _mm512_maskz_cvt_roundps_pd(__mmask8 __U, __m256 __A, + const int __R) { return (__m512d)__builtin_ia32_cvtps2pd512_mask( (__v8sf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundph_ps(__m256i __A, const int __R) { +__funline __m512 _mm512_cvt_roundph_ps(__m256i __A, const int __R) { return (__m512)__builtin_ia32_vcvtph2ps512_mask( (__v16hi)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundph_ps(__m512 __W, __mmask16 __U, __m256i __A, - const int __R) { +__funline __m512 _mm512_mask_cvt_roundph_ps(__m512 __W, __mmask16 __U, + __m256i __A, const int __R) { return (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)__A, (__v16sf)__W, (__mmask16)__U, 
__R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundph_ps(__mmask16 __U, __m256i __A, const int __R) { +__funline __m512 _mm512_maskz_cvt_roundph_ps(__mmask16 __U, __m256i __A, + const int __R) { return (__m512)__builtin_ia32_vcvtph2ps512_mask( (__v16hi)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundps_ph(__m512 __A, const int __I) { +__funline __m256i _mm512_cvt_roundps_ph(__m512 __A, const int __I) { return (__m256i)__builtin_ia32_vcvtps2ph512_mask( (__v16sf)__A, __I, (__v16hi)_mm256_undefined_si256(), -1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvtps_ph(__m512 __A, const int __I) { +__funline __m256i _mm512_cvtps_ph(__m512 __A, const int __I) { return (__m256i)__builtin_ia32_vcvtps2ph512_mask( (__v16sf)__A, __I, (__v16hi)_mm256_undefined_si256(), -1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundps_ph(__m256i __U, __mmask16 __W, __m512 __A, - const int __I) { +__funline __m256i _mm512_mask_cvt_roundps_ph(__m256i __U, __mmask16 __W, + __m512 __A, const int __I) { return (__m256i)__builtin_ia32_vcvtps2ph512_mask( (__v16sf)__A, __I, (__v16hi)__U, (__mmask16)__W); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_cvtps_ph(__m256i __U, __mmask16 __W, __m512 __A, const int __I) { +__funline __m256i _mm512_mask_cvtps_ph(__m256i __U, __mmask16 __W, __m512 __A, + const int __I) { return (__m256i)__builtin_ia32_vcvtps2ph512_mask( (__v16sf)__A, __I, (__v16hi)__U, (__mmask16)__W); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundps_ph(__mmask16 __W, __m512 __A, const int __I) { +__funline __m256i _mm512_maskz_cvt_roundps_ph(__mmask16 __W, __m512 __A, + const int __I) { return (__m256i)__builtin_ia32_vcvtps2ph512_mask( (__v16sf)__A, __I, (__v16hi)_mm256_setzero_si256(), (__mmask16)__W); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvtps_ph(__mmask16 __W, __m512 __A, const int __I) { +__funline __m256i _mm512_maskz_cvtps_ph(__mmask16 __W, __m512 __A, + const int __I) { return (__m256i)__builtin_ia32_vcvtps2ph512_mask( (__v16sf)__A, __I, (__v16hi)_mm256_setzero_si256(), (__mmask16)__W); } @@ -6783,37 +5596,28 @@ extern __inline __m256i #endif #ifdef __OPTIMIZE__ -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cvt_roundpd_ps(__m512d __A, const int __R) { +__funline __m256 _mm512_cvt_roundpd_ps(__m512d __A, const int __R) { return (__m256)__builtin_ia32_cvtpd2ps512_mask( (__v8df)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, __R); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cvt_roundpd_ps(__m256 __W, __mmask8 __U, __m512d __A, - const int __R) { +__funline __m256 _mm512_mask_cvt_roundpd_ps(__m256 __W, __mmask8 __U, __m512d __A, + const int __R) { return (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)__A, (__v8sf)__W, (__mmask8)__U, __R); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_cvt_roundpd_ps(__mmask8 __U, __m512d __A, const int __R) { +__funline __m256 _mm512_maskz_cvt_roundpd_ps(__mmask8 
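/* Aside: a float -> half -> float round trip through the converts above;
   the imm8 selects the rounding used when narrowing to 16-bit floats.
   f16_roundtrip_demo is a hypothetical helper, assuming AVX-512F: */
static __m512 f16_roundtrip_demo(__m512 x) {
  __m256i h = _mm512_cvtps_ph(x, _MM_FROUND_TO_NEAREST_INT);
  return _mm512_cvtph_ps(h);
}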
__U, __m512d __A, + const int __R) { return (__m256)__builtin_ia32_cvtpd2ps512_mask( (__v8df)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundsd_ss(__m128 __A, __m128d __B, const int __R) { +__funline __m128 _mm_cvt_roundsd_ss(__m128 __A, __m128d __B, const int __R) { return (__m128)__builtin_ia32_cvtsd2ss_round((__v4sf)__A, (__v2df)__B, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_roundss_sd(__m128d __A, __m128 __B, const int __R) { +__funline __m128d _mm_cvt_roundss_sd(__m128d __A, __m128 __B, const int __R) { return (__m128d)__builtin_ia32_cvtss2sd_round((__v2df)__A, (__v4sf)__B, __R); } #else @@ -6834,27 +5638,19 @@ extern __inline __m128d (__m128d) __builtin_ia32_cvtss2sd_round(A, B, C) #endif -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_stream_si512(__m512i *__P, __m512i __A) { +__funline void _mm512_stream_si512(__m512i *__P, __m512i __A) { __builtin_ia32_movntdq512((__v8di *)__P, (__v8di)__A); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_stream_ps(float *__P, __m512 __A) { +__funline void _mm512_stream_ps(float *__P, __m512 __A) { __builtin_ia32_movntps512(__P, (__v16sf)__A); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_stream_pd(double *__P, __m512d __A) { +__funline void _mm512_stream_pd(double *__P, __m512d __A) { __builtin_ia32_movntpd512(__P, (__v8df)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_stream_load_si512(void *__P) { +__funline __m512i _mm512_stream_load_si512(void *__P) { return __builtin_ia32_movntdqa512((__v8di *)__P); } @@ -6872,197 +5668,169 @@ typedef enum { } _MM_MANTISSA_SIGN_ENUM; #ifdef __OPTIMIZE__ -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_getexp_round_ss(__m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_getexp_round_ss(__m128 __A, __m128 __B, const int __R) { return (__m128)__builtin_ia32_getexpss128_round((__v4sf)__A, (__v4sf)__B, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_getexp_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_mask_getexp_round_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_getexpss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_getexp_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_maskz_getexp_round_ss(__mmask8 __U, __m128 __A, __m128 __B, + const int __R) { return (__m128)__builtin_ia32_getexpss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_getexp_round_sd(__m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_getexp_round_sd(__m128d __A, __m128d __B, const int __R) { return (__m128d)__builtin_ia32_getexpsd128_round((__v2df)__A, (__v2df)__B, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
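/* Aside: the stream stores above are non-temporal: they bypass the cache,
   require 64-byte-aligned destinations, and want an sfence before the data
   is consumed by another agent. Hypothetical fill loop, assuming <stddef.h>
   and an aligned dst: */
static void stream_fill_demo(float *dst, size_t n, __m512 v) {
  for (size_t i = 0; i + 16 <= n; i += 16)
    _mm512_stream_ps(dst + i, v);
  _mm_sfence();
}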
_mm_mask_getexp_round_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, const int __R) { +__funline __m128d _mm_mask_getexp_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { return (__m128d)__builtin_ia32_getexpsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_getexp_round_sd(__mmask8 __U, __m128d __A, __m128d __B, - const int __R) { +__funline __m128d _mm_maskz_getexp_round_sd(__mmask8 __U, __m128d __A, + __m128d __B, const int __R) { return (__m128d)__builtin_ia32_getexpsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_getexp_round_ps(__m512 __A, const int __R) { +__funline __m512 _mm512_getexp_round_ps(__m512 __A, const int __R) { return (__m512)__builtin_ia32_getexpps512_mask( (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_getexp_round_ps(__m512 __W, __mmask16 __U, __m512 __A, - const int __R) { +__funline __m512 _mm512_mask_getexp_round_ps(__m512 __W, __mmask16 __U, + __m512 __A, const int __R) { return (__m512)__builtin_ia32_getexpps512_mask((__v16sf)__A, (__v16sf)__W, (__mmask16)__U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_getexp_round_ps(__mmask16 __U, __m512 __A, const int __R) { +__funline __m512 _mm512_maskz_getexp_round_ps(__mmask16 __U, __m512 __A, + const int __R) { return (__m512)__builtin_ia32_getexpps512_mask( (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_getexp_round_pd(__m512d __A, const int __R) { +__funline __m512d _mm512_getexp_round_pd(__m512d __A, const int __R) { return (__m512d)__builtin_ia32_getexppd512_mask( (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_getexp_round_pd(__m512d __W, __mmask8 __U, __m512d __A, - const int __R) { +__funline __m512d _mm512_mask_getexp_round_pd(__m512d __W, __mmask8 __U, + __m512d __A, const int __R) { return (__m512d)__builtin_ia32_getexppd512_mask((__v8df)__A, (__v8df)__W, (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_getexp_round_pd(__mmask8 __U, __m512d __A, const int __R) { +__funline __m512d _mm512_maskz_getexp_round_pd(__mmask8 __U, __m512d __A, + const int __R) { return (__m512d)__builtin_ia32_getexppd512_mask( (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_getmant_round_pd(__m512d __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C, const int __R) { +__funline __m512d _mm512_getmant_round_pd(__m512d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, + const int __R) { return (__m512d)__builtin_ia32_getmantpd512_mask( (__v8df)__A, (__C << 2) | __B, _mm512_undefined_pd(), (__mmask8)-1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_getmant_round_pd(__m512d __W, __mmask8 __U, 
__m512d __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C, const int __R) { +__funline __m512d _mm512_mask_getmant_round_pd(__m512d __W, __mmask8 __U, + __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, + const int __R) { return (__m512d)__builtin_ia32_getmantpd512_mask( (__v8df)__A, (__C << 2) | __B, (__v8df)__W, __U, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_getmant_round_pd(__mmask8 __U, __m512d __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C, const int __R) { +__funline __m512d _mm512_maskz_getmant_round_pd(__mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, + const int __R) { return (__m512d)__builtin_ia32_getmantpd512_mask( (__v8df)__A, (__C << 2) | __B, (__v8df)_mm512_setzero_pd(), __U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_getmant_round_ps(__m512 __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C, const int __R) { +__funline __m512 _mm512_getmant_round_ps(__m512 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, + const int __R) { return (__m512)__builtin_ia32_getmantps512_mask( (__v16sf)__A, (__C << 2) | __B, _mm512_undefined_ps(), (__mmask16)-1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_getmant_round_ps(__m512 __W, __mmask16 __U, __m512 __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C, const int __R) { +__funline __m512 _mm512_mask_getmant_round_ps(__m512 __W, __mmask16 __U, + __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, + const int __R) { return (__m512)__builtin_ia32_getmantps512_mask( (__v16sf)__A, (__C << 2) | __B, (__v16sf)__W, __U, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_getmant_round_ps(__mmask16 __U, __m512 __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C, const int __R) { +__funline __m512 _mm512_maskz_getmant_round_ps(__mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, + const int __R) { return (__m512)__builtin_ia32_getmantps512_mask( (__v16sf)__A, (__C << 2) | __B, (__v16sf)_mm512_setzero_ps(), __U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_getmant_round_sd(__m128d __A, __m128d __B, _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, const int __R) { +__funline __m128d _mm_getmant_round_sd(__m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, + const int __R) { return (__m128d)__builtin_ia32_getmantsd_round((__v2df)__A, (__v2df)__B, (__D << 2) | __C, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_getmant_round_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, const int __R) { +__funline __m128d _mm_mask_getmant_round_sd(__m128d __W, __mmask8 __U, + __m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, + const int __R) { return (__m128d)__builtin_ia32_getmantsd_mask_round( (__v2df)__A, (__v2df)__B, (__D << 2) | __C, (__v2df)__W, __U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_getmant_round_sd(__mmask8 __U, __m128d __A, __m128d __B, - 
_MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, const int __R) { +__funline __m128d _mm_maskz_getmant_round_sd(__mmask8 __U, __m128d __A, + __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, + const int __R) { return (__m128d)__builtin_ia32_getmantsd_mask_round( (__v2df)__A, (__v2df)__B, (__D << 2) | __C, (__v2df)_mm_setzero_pd(), __U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_getmant_round_ss(__m128 __A, __m128 __B, _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, const int __R) { +__funline __m128 _mm_getmant_round_ss(__m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) { return (__m128)__builtin_ia32_getmantss_round((__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_getmant_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, const int __R) { +__funline __m128 _mm_mask_getmant_round_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, + const int __R) { return (__m128)__builtin_ia32_getmantss_mask_round( (__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, (__v4sf)__W, __U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_getmant_round_ss(__mmask8 __U, __m128 __A, __m128 __B, - _MM_MANTISSA_NORM_ENUM __C, - _MM_MANTISSA_SIGN_ENUM __D, const int __R) { +__funline __m128 _mm_maskz_getmant_round_ss(__mmask8 __U, __m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, + const int __R) { return (__m128)__builtin_ia32_getmantss_mask_round( (__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, (__v4sf)_mm_setzero_ps(), __U, __R); @@ -7174,64 +5942,54 @@ extern __inline __m128 #endif #ifdef __OPTIMIZE__ -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_roundscale_round_ps(__m512 __A, const int __imm, const int __R) { +__funline __m512 _mm512_roundscale_round_ps(__m512 __A, const int __imm, + const int __R) { return (__m512)__builtin_ia32_rndscaleps_mask( (__v16sf)__A, __imm, (__v16sf)_mm512_undefined_ps(), -1, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_roundscale_round_ps(__m512 __A, __mmask16 __B, __m512 __C, - const int __imm, const int __R) { +__funline __m512 _mm512_mask_roundscale_round_ps(__m512 __A, __mmask16 __B, + __m512 __C, const int __imm, + const int __R) { return (__m512)__builtin_ia32_rndscaleps_mask( (__v16sf)__C, __imm, (__v16sf)__A, (__mmask16)__B, __R); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_roundscale_round_ps(__mmask16 __A, __m512 __B, const int __imm, - const int __R) { +__funline __m512 _mm512_maskz_roundscale_round_ps(__mmask16 __A, __m512 __B, + const int __imm, + const int __R) { return (__m512)__builtin_ia32_rndscaleps_mask( (__v16sf)__B, __imm, (__v16sf)_mm512_setzero_ps(), (__mmask16)__A, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_roundscale_round_pd(__m512d __A, const int __imm, const int __R) { +__funline __m512d _mm512_roundscale_round_pd(__m512d __A, const int __imm, + const int __R) { return (__m512d)__builtin_ia32_rndscalepd_mask( (__v8df)__A, __imm, 
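/* Aside: getexp/getmant above factor a value as x = getmant(x) *
   2^getexp(x), the vector analogue of frexp; with _MM_MANT_NORM_1_2 the
   mantissa lands in [1,2). split_demo is a hypothetical helper, assuming
   __OPTIMIZE__ so the enum immediates fold: */
static void split_demo(__m512d x, __m512d *mant, __m512d *exp2) {
  *mant = _mm512_getmant_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
  *exp2 = _mm512_getexp_pd(x);
}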
(__v8df)_mm512_undefined_pd(), -1, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_roundscale_round_pd(__m512d __A, __mmask8 __B, __m512d __C, - const int __imm, const int __R) { +__funline __m512d _mm512_mask_roundscale_round_pd(__m512d __A, __mmask8 __B, + __m512d __C, const int __imm, + const int __R) { return (__m512d)__builtin_ia32_rndscalepd_mask( (__v8df)__C, __imm, (__v8df)__A, (__mmask8)__B, __R); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_roundscale_round_pd(__mmask8 __A, __m512d __B, const int __imm, - const int __R) { +__funline __m512d _mm512_maskz_roundscale_round_pd(__mmask8 __A, __m512d __B, + const int __imm, + const int __R) { return (__m512d)__builtin_ia32_rndscalepd_mask( (__v8df)__B, __imm, (__v8df)_mm512_setzero_pd(), (__mmask8)__A, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_roundscale_round_ss(__m128 __A, __m128 __B, const int __imm, - const int __R) { +__funline __m128 _mm_roundscale_round_ss(__m128 __A, __m128 __B, const int __imm, + const int __R) { return (__m128)__builtin_ia32_rndscaless_round((__v4sf)__A, (__v4sf)__B, __imm, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_roundscale_round_sd(__m128d __A, __m128d __B, const int __imm, - const int __R) { +__funline __m128d _mm_roundscale_round_sd(__m128d __A, __m128d __B, + const int __imm, const int __R) { return (__m128d)__builtin_ia32_rndscalesd_round((__v2df)__A, (__v2df)__B, __imm, __R); } @@ -7268,113 +6026,86 @@ extern __inline __m128d (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), R)) #endif -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_floor_ps(__m512 __A) { +__funline __m512 _mm512_floor_ps(__m512 __A) { return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, _MM_FROUND_FLOOR, (__v16sf)__A, -1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_floor_pd(__m512d __A) { +__funline __m512d _mm512_floor_pd(__m512d __A) { return (__m512d)__builtin_ia32_rndscalepd_mask( (__v8df)__A, _MM_FROUND_FLOOR, (__v8df)__A, -1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_ceil_ps(__m512 __A) { +__funline __m512 _mm512_ceil_ps(__m512 __A) { return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, _MM_FROUND_CEIL, (__v16sf)__A, -1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_ceil_pd(__m512d __A) { +__funline __m512d _mm512_ceil_pd(__m512d __A) { return (__m512d)__builtin_ia32_rndscalepd_mask( (__v8df)__A, _MM_FROUND_CEIL, (__v8df)__A, -1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_floor_ps(__m512 __W, __mmask16 __U, __m512 __A) { +__funline __m512 _mm512_mask_floor_ps(__m512 __W, __mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, _MM_FROUND_FLOOR, (__v16sf)__W, __U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_floor_pd(__m512d __W, __mmask8 __U, __m512d __A) { +__funline __m512d _mm512_mask_floor_pd(__m512d __W, __mmask8 __U, 
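/* Aside: floor/ceil above are just roundscale with a direction imm and a
   scale of 2^0, so no dedicated rounding instruction is needed.
   Hypothetical helper: */
static __m512 floor_demo(__m512 x) {
  return _mm512_floor_ps(x);  /* == roundscale with _MM_FROUND_FLOOR, scale 2^0 */
}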
__m512d __A) { return (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)__A, _MM_FROUND_FLOOR, (__v8df)__W, __U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_ceil_ps(__m512 __W, __mmask16 __U, __m512 __A) { +__funline __m512 _mm512_mask_ceil_ps(__m512 __W, __mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, _MM_FROUND_CEIL, (__v16sf)__W, __U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_ceil_pd(__m512d __W, __mmask8 __U, __m512d __A) { +__funline __m512d _mm512_mask_ceil_pd(__m512d __W, __mmask8 __U, __m512d __A) { return (__m512d)__builtin_ia32_rndscalepd_mask( (__v8df)__A, _MM_FROUND_CEIL, (__v8df)__W, __U, _MM_FROUND_CUR_DIRECTION); } #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_alignr_epi32(__m512i __A, __m512i __B, const int __imm) { +__funline __m512i _mm512_alignr_epi32(__m512i __A, __m512i __B, const int __imm) { return (__m512i)__builtin_ia32_alignd512_mask( (__v16si)__A, (__v16si)__B, __imm, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_alignr_epi32(__m512i __W, __mmask16 __U, __m512i __A, - __m512i __B, const int __imm) { +__funline __m512i _mm512_mask_alignr_epi32(__m512i __W, __mmask16 __U, + __m512i __A, __m512i __B, + const int __imm) { return (__m512i)__builtin_ia32_alignd512_mask( (__v16si)__A, (__v16si)__B, __imm, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_alignr_epi32(__mmask16 __U, __m512i __A, __m512i __B, - const int __imm) { +__funline __m512i _mm512_maskz_alignr_epi32(__mmask16 __U, __m512i __A, + __m512i __B, const int __imm) { return (__m512i)__builtin_ia32_alignd512_mask( (__v16si)__A, (__v16si)__B, __imm, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_alignr_epi64(__m512i __A, __m512i __B, const int __imm) { +__funline __m512i _mm512_alignr_epi64(__m512i __A, __m512i __B, const int __imm) { return (__m512i)__builtin_ia32_alignq512_mask( (__v8di)__A, (__v8di)__B, __imm, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_alignr_epi64(__m512i __W, __mmask8 __U, __m512i __A, - __m512i __B, const int __imm) { +__funline __m512i _mm512_mask_alignr_epi64(__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B, const int __imm) { return (__m512i)__builtin_ia32_alignq512_mask((__v8di)__A, (__v8di)__B, __imm, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_alignr_epi64(__mmask8 __U, __m512i __A, __m512i __B, - const int __imm) { +__funline __m512i _mm512_maskz_alignr_epi64(__mmask8 __U, __m512i __A, + __m512i __B, const int __imm) { return (__m512i)__builtin_ia32_alignq512_mask((__v8di)__A, (__v8di)__B, __imm, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); @@ -7411,282 +6142,222 @@ extern __inline __m512i (__v8di)_mm512_setzero_si512(), (__mmask8)(U))) #endif -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
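/* Aside: alignr_epi32 above treats A:B as one 1024-bit value (A high) and
   extracts 16 dwords starting __imm elements above the bottom of B.
   drop_low4_demo is a hypothetical helper, assuming __OPTIMIZE__: */
static __m512i drop_low4_demo(__m512i a, __m512i b) {
  return _mm512_alignr_epi32(a, b, 4);  /* result = b4..b15 then a0..a3 */
}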
_mm512_cmpeq_epi32_mask(__m512i __A, __m512i __B) { +__funline __mmask16 _mm512_cmpeq_epi32_mask(__m512i __A, __m512i __B) { return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__A, (__v16si)__B, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpeq_epi32_mask(__mmask16 __U, __m512i __A, __m512i __B) { +__funline __mmask16 _mm512_mask_cmpeq_epi32_mask(__mmask16 __U, __m512i __A, + __m512i __B) { return (__mmask16)__builtin_ia32_pcmpeqd512_mask((__v16si)__A, (__v16si)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpeq_epi64_mask(__mmask8 __U, __m512i __A, __m512i __B) { +__funline __mmask8 _mm512_mask_cmpeq_epi64_mask(__mmask8 __U, __m512i __A, + __m512i __B) { return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__A, (__v8di)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpeq_epi64_mask(__m512i __A, __m512i __B) { +__funline __mmask8 _mm512_cmpeq_epi64_mask(__m512i __A, __m512i __B) { return (__mmask8)__builtin_ia32_pcmpeqq512_mask((__v8di)__A, (__v8di)__B, (__mmask8)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpgt_epi32_mask(__m512i __A, __m512i __B) { +__funline __mmask16 _mm512_cmpgt_epi32_mask(__m512i __A, __m512i __B) { return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__A, (__v16si)__B, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpgt_epi32_mask(__mmask16 __U, __m512i __A, __m512i __B) { +__funline __mmask16 _mm512_mask_cmpgt_epi32_mask(__mmask16 __U, __m512i __A, + __m512i __B) { return (__mmask16)__builtin_ia32_pcmpgtd512_mask((__v16si)__A, (__v16si)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpgt_epi64_mask(__mmask8 __U, __m512i __A, __m512i __B) { +__funline __mmask8 _mm512_mask_cmpgt_epi64_mask(__mmask8 __U, __m512i __A, + __m512i __B) { return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__A, (__v8di)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpgt_epi64_mask(__m512i __A, __m512i __B) { +__funline __mmask8 _mm512_cmpgt_epi64_mask(__m512i __A, __m512i __B) { return (__mmask8)__builtin_ia32_pcmpgtq512_mask((__v8di)__A, (__v8di)__B, (__mmask8)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpge_epi32_mask(__m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_cmpge_epi32_mask(__m512i __X, __m512i __Y) { return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 5, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpge_epi32_mask(__mmask16 __M, __m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_mask_cmpge_epi32_mask(__mmask16 __M, __m512i __X, + __m512i __Y) { return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 5, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpge_epu32_mask(__mmask16 __M, __m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_mask_cmpge_epu32_mask(__mmask16 __M, __m512i __X, + __m512i __Y) { return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, 
(__v16si)__Y, 5, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpge_epu32_mask(__m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_cmpge_epu32_mask(__m512i __X, __m512i __Y) { return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 5, (__mmask16)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpge_epi64_mask(__mmask8 __M, __m512i __X, __m512i __Y) { +__funline __mmask8 _mm512_mask_cmpge_epi64_mask(__mmask8 __M, __m512i __X, + __m512i __Y) { return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 5, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpge_epi64_mask(__m512i __X, __m512i __Y) { +__funline __mmask8 _mm512_cmpge_epi64_mask(__m512i __X, __m512i __Y) { return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 5, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpge_epu64_mask(__mmask8 __M, __m512i __X, __m512i __Y) { +__funline __mmask8 _mm512_mask_cmpge_epu64_mask(__mmask8 __M, __m512i __X, + __m512i __Y) { return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 5, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpge_epu64_mask(__m512i __X, __m512i __Y) { +__funline __mmask8 _mm512_cmpge_epu64_mask(__m512i __X, __m512i __Y) { return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 5, (__mmask8)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmple_epi32_mask(__mmask16 __M, __m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_mask_cmple_epi32_mask(__mmask16 __M, __m512i __X, + __m512i __Y) { return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 2, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmple_epi32_mask(__m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_cmple_epi32_mask(__m512i __X, __m512i __Y) { return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 2, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmple_epu32_mask(__mmask16 __M, __m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_mask_cmple_epu32_mask(__mmask16 __M, __m512i __X, + __m512i __Y) { return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 2, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmple_epu32_mask(__m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_cmple_epu32_mask(__m512i __X, __m512i __Y) { return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 2, (__mmask16)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmple_epi64_mask(__mmask8 __M, __m512i __X, __m512i __Y) { +__funline __mmask8 _mm512_mask_cmple_epi64_mask(__mmask8 __M, __m512i __X, + __m512i __Y) { return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 2, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmple_epi64_mask(__m512i __X, __m512i __Y) { 
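/* Aside: the named compares above are fixed-predicate wrappers over the
   generic cmp builtin (1 = LT, 2 = LE, 4 = NE, 5 = GE). The open-coded
   equivalent, as a hypothetical helper assuming __OPTIMIZE__: */
static __mmask16 ge_demo(__m512i x, __m512i y) {
  return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_GE);
}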
+__funline __mmask8 _mm512_cmple_epi64_mask(__m512i __X, __m512i __Y) { return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 2, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmple_epu64_mask(__mmask8 __M, __m512i __X, __m512i __Y) { +__funline __mmask8 _mm512_mask_cmple_epu64_mask(__mmask8 __M, __m512i __X, + __m512i __Y) { return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 2, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmple_epu64_mask(__m512i __X, __m512i __Y) { +__funline __mmask8 _mm512_cmple_epu64_mask(__m512i __X, __m512i __Y) { return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 2, (__mmask8)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmplt_epi32_mask(__mmask16 __M, __m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_mask_cmplt_epi32_mask(__mmask16 __M, __m512i __X, + __m512i __Y) { return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 1, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmplt_epi32_mask(__m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_cmplt_epi32_mask(__m512i __X, __m512i __Y) { return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 1, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmplt_epu32_mask(__mmask16 __M, __m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_mask_cmplt_epu32_mask(__mmask16 __M, __m512i __X, + __m512i __Y) { return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 1, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmplt_epu32_mask(__m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_cmplt_epu32_mask(__m512i __X, __m512i __Y) { return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 1, (__mmask16)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmplt_epi64_mask(__mmask8 __M, __m512i __X, __m512i __Y) { +__funline __mmask8 _mm512_mask_cmplt_epi64_mask(__mmask8 __M, __m512i __X, + __m512i __Y) { return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 1, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmplt_epi64_mask(__m512i __X, __m512i __Y) { +__funline __mmask8 _mm512_cmplt_epi64_mask(__m512i __X, __m512i __Y) { return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 1, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmplt_epu64_mask(__mmask8 __M, __m512i __X, __m512i __Y) { +__funline __mmask8 _mm512_mask_cmplt_epu64_mask(__mmask8 __M, __m512i __X, + __m512i __Y) { return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 1, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmplt_epu64_mask(__m512i __X, __m512i __Y) { +__funline __mmask8 _mm512_cmplt_epu64_mask(__m512i __X, __m512i __Y) { return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 1, (__mmask8)-1); } -extern __inline __mmask16 - 
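/*
 * Illustrative aside (not part of the patch): a minimal sketch of how the
 * __funline compare intrinsics above are consumed. Assumes an AVX-512F
 * target and <immintrin.h>; count_below_epi32 is a hypothetical helper.
 */
#include <immintrin.h>
static inline int count_below_epi32(const int *p, int limit) {
  __m512i v = _mm512_loadu_si512((const void *)p); /* 16 packed ints      */
  __m512i t = _mm512_set1_epi32(limit);
  __mmask16 m = _mm512_cmplt_epi32_mask(v, t);     /* one bit per lane    */
  return __builtin_popcount((unsigned)m);          /* lanes below limit   */
}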
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpneq_epi32_mask(__m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_cmpneq_epi32_mask(__m512i __X, __m512i __Y) { return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 4, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpneq_epi32_mask(__mmask16 __M, __m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_mask_cmpneq_epi32_mask(__mmask16 __M, __m512i __X, + __m512i __Y) { return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, 4, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpneq_epu32_mask(__mmask16 __M, __m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_mask_cmpneq_epu32_mask(__mmask16 __M, __m512i __X, + __m512i __Y) { return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 4, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpneq_epu32_mask(__m512i __X, __m512i __Y) { +__funline __mmask16 _mm512_cmpneq_epu32_mask(__m512i __X, __m512i __Y) { return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, 4, (__mmask16)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpneq_epi64_mask(__mmask8 __M, __m512i __X, __m512i __Y) { +__funline __mmask8 _mm512_mask_cmpneq_epi64_mask(__mmask8 __M, __m512i __X, + __m512i __Y) { return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 4, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpneq_epi64_mask(__m512i __X, __m512i __Y) { +__funline __mmask8 _mm512_cmpneq_epi64_mask(__m512i __X, __m512i __Y) { return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, 4, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmpneq_epu64_mask(__mmask8 __M, __m512i __X, __m512i __Y) { +__funline __mmask8 _mm512_mask_cmpneq_epu64_mask(__mmask8 __M, __m512i __X, + __m512i __Y) { return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 4, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmpneq_epu64_mask(__m512i __X, __m512i __Y) { +__funline __mmask8 _mm512_cmpneq_epu64_mask(__m512i __X, __m512i __Y) { return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, 4, (__mmask8)-1); } @@ -7702,135 +6373,110 @@ extern __inline __mmask8 #define _MM_CMPINT_GT 0x6 #ifdef __OPTIMIZE__ -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kshiftli_mask16(__mmask16 __A, unsigned int __B) { +__funline __mmask16 _kshiftli_mask16(__mmask16 __A, unsigned int __B) { return (__mmask16)__builtin_ia32_kshiftlihi((__mmask16)__A, (__mmask8)__B); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kshiftri_mask16(__mmask16 __A, unsigned int __B) { +__funline __mmask16 _kshiftri_mask16(__mmask16 __A, unsigned int __B) { return (__mmask16)__builtin_ia32_kshiftrihi((__mmask16)__A, (__mmask8)__B); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmp_epi64_mask(__m512i __X, __m512i __Y, const int __P) { +__funline 
__mmask8 _mm512_cmp_epi64_mask(__m512i __X, __m512i __Y, + const int __P) { return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, __P, (__mmask8)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmp_epi32_mask(__m512i __X, __m512i __Y, const int __P) { +__funline __mmask16 _mm512_cmp_epi32_mask(__m512i __X, __m512i __Y, + const int __P) { return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, __P, (__mmask16)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmp_epu64_mask(__m512i __X, __m512i __Y, const int __P) { +__funline __mmask8 _mm512_cmp_epu64_mask(__m512i __X, __m512i __Y, + const int __P) { return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, __P, (__mmask8)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmp_epu32_mask(__m512i __X, __m512i __Y, const int __P) { +__funline __mmask16 _mm512_cmp_epu32_mask(__m512i __X, __m512i __Y, + const int __P) { return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, __P, (__mmask16)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_cmp_round_pd_mask(__m512d __X, __m512d __Y, const int __P, - const int __R) { +__funline __mmask8 _mm512_cmp_round_pd_mask(__m512d __X, __m512d __Y, + const int __P, const int __R) { return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, __P, (__mmask8)-1, __R); } -extern __inline __mmask16 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_cmp_round_ps_mask(__m512 __X, __m512 __Y, const int __P, const int __R) { +__funline __mmask16 _mm512_cmp_round_ps_mask(__m512 __X, __m512 __Y, + const int __P, const int __R) { return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, __P, (__mmask16)-1, __R); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmp_epi64_mask(__mmask8 __U, __m512i __X, __m512i __Y, - const int __P) { +__funline __mmask8 _mm512_mask_cmp_epi64_mask(__mmask8 __U, __m512i __X, + __m512i __Y, const int __P) { return (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)__X, (__v8di)__Y, __P, (__mmask8)__U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmp_epi32_mask(__mmask16 __U, __m512i __X, __m512i __Y, - const int __P) { +__funline __mmask16 _mm512_mask_cmp_epi32_mask(__mmask16 __U, __m512i __X, + __m512i __Y, const int __P) { return (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)__X, (__v16si)__Y, __P, (__mmask16)__U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmp_epu64_mask(__mmask8 __U, __m512i __X, __m512i __Y, - const int __P) { +__funline __mmask8 _mm512_mask_cmp_epu64_mask(__mmask8 __U, __m512i __X, + __m512i __Y, const int __P) { return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__X, (__v8di)__Y, __P, (__mmask8)__U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmp_epu32_mask(__mmask16 __U, __m512i __X, __m512i __Y, - const int __P) { +__funline __mmask16 _mm512_mask_cmp_epu32_mask(__mmask16 __U, __m512i __X, + __m512i __Y, const int __P) { return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__X, (__v16si)__Y, __P, (__mmask16)__U); } -extern __inline 
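/*
 * Illustrative aside: the predicate forms above take a comparison
 * immediate, so one builtin covers every relation; the _MM_CMPINT_*
 * constants encode it (1 = LT, 2 = LE, 4 = NE, 5 = GE). Hypothetical
 * sketch, assuming <immintrin.h>, an AVX-512F target, and __OPTIMIZE__
 * (the immediate must be a compile-time constant):
 */
#include <immintrin.h>
static inline __mmask16 le_mask_epi32(__m512i x, __m512i y) {
  return _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_LE); /* same as cmple */
}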
__mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmp_round_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y, - const int __P, const int __R) { +__funline __mmask8 _mm512_mask_cmp_round_pd_mask(__mmask8 __U, __m512d __X, + __m512d __Y, const int __P, + const int __R) { return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, __P, (__mmask8)__U, __R); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_cmp_round_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y, - const int __P, const int __R) { +__funline __mmask16 _mm512_mask_cmp_round_ps_mask(__mmask16 __U, __m512 __X, + __m512 __Y, const int __P, + const int __R) { return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, __P, (__mmask16)__U, __R); } -extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_cmp_round_sd_mask(__m128d __X, __m128d __Y, const int __P, const int __R) { +__funline __mmask8 _mm_cmp_round_sd_mask(__m128d __X, __m128d __Y, const int __P, + const int __R) { return (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)__X, (__v2df)__Y, __P, (__mmask8)-1, __R); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmp_round_sd_mask(__mmask8 __M, __m128d __X, __m128d __Y, - const int __P, const int __R) { +__funline __mmask8 _mm_mask_cmp_round_sd_mask(__mmask8 __M, __m128d __X, + __m128d __Y, const int __P, + const int __R) { return (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)__X, (__v2df)__Y, __P, (__mmask8)__M, __R); } -extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_cmp_round_ss_mask(__m128 __X, __m128 __Y, const int __P, const int __R) { +__funline __mmask8 _mm_cmp_round_ss_mask(__m128 __X, __m128 __Y, const int __P, + const int __R) { return (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)__X, (__v4sf)__Y, __P, (__mmask8)-1, __R); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmp_round_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y, - const int __P, const int __R) { +__funline __mmask8 _mm_mask_cmp_round_ss_mask(__mmask8 __M, __m128 __X, + __m128 __Y, const int __P, + const int __R) { return (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)__X, (__v4sf)__Y, __P, (__mmask8)__M, __R); } @@ -7908,9 +6554,8 @@ extern __inline __mmask8 #endif #ifdef __OPTIMIZE__ -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_i32gather_ps(__m512i __index, void const *__addr, int __scale) { +__funline __m512 _mm512_i32gather_ps(__m512i __index, void const *__addr, + int __scale) { __m512 __v1_old = _mm512_undefined_ps(); __mmask16 __mask = 0xFFFF; @@ -7918,17 +6563,15 @@ extern __inline __m512 (__v16sf)__v1_old, __addr, (__v16si)__index, __mask, __scale); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i32gather_ps(__m512 __v1_old, __mmask16 __mask, __m512i __index, - void const *__addr, int __scale) { +__funline __m512 _mm512_mask_i32gather_ps(__m512 __v1_old, __mmask16 __mask, + __m512i __index, void const *__addr, + int __scale) { return (__m512)__builtin_ia32_gathersiv16sf( (__v16sf)__v1_old, __addr, (__v16si)__index, __mask, __scale); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_i32gather_pd(__m256i __index, void const *__addr, int __scale) { 
+__funline __m512d _mm512_i32gather_pd(__m256i __index, void const *__addr, + int __scale) { __m512d __v1_old = _mm512_undefined_pd(); __mmask8 __mask = 0xFF; @@ -7936,17 +6579,15 @@ extern __inline __m512d (__v8si)__index, __mask, __scale); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i32gather_pd(__m512d __v1_old, __mmask8 __mask, __m256i __index, - void const *__addr, int __scale) { +__funline __m512d _mm512_mask_i32gather_pd(__m512d __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) { return (__m512d)__builtin_ia32_gathersiv8df((__v8df)__v1_old, __addr, (__v8si)__index, __mask, __scale); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_i64gather_ps(__m512i __index, void const *__addr, int __scale) { +__funline __m256 _mm512_i64gather_ps(__m512i __index, void const *__addr, + int __scale) { __m256 __v1_old = _mm256_undefined_ps(); __mmask8 __mask = 0xFF; @@ -7954,17 +6595,15 @@ extern __inline __m256 (__v8di)__index, __mask, __scale); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i64gather_ps(__m256 __v1_old, __mmask8 __mask, __m512i __index, - void const *__addr, int __scale) { +__funline __m256 _mm512_mask_i64gather_ps(__m256 __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, + int __scale) { return (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)__v1_old, __addr, (__v8di)__index, __mask, __scale); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_i64gather_pd(__m512i __index, void const *__addr, int __scale) { +__funline __m512d _mm512_i64gather_pd(__m512i __index, void const *__addr, + int __scale) { __m512d __v1_old = _mm512_undefined_pd(); __mmask8 __mask = 0xFF; @@ -7972,17 +6611,15 @@ extern __inline __m512d (__v8di)__index, __mask, __scale); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i64gather_pd(__m512d __v1_old, __mmask8 __mask, __m512i __index, - void const *__addr, int __scale) { +__funline __m512d _mm512_mask_i64gather_pd(__m512d __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, + int __scale) { return (__m512d)__builtin_ia32_gatherdiv8df((__v8df)__v1_old, __addr, (__v8di)__index, __mask, __scale); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_i32gather_epi32(__m512i __index, void const *__addr, int __scale) { +__funline __m512i _mm512_i32gather_epi32(__m512i __index, void const *__addr, + int __scale) { __m512i __v1_old = _mm512_undefined_epi32(); __mmask16 __mask = 0xFFFF; @@ -7990,18 +6627,15 @@ extern __inline __m512i (__v16si)__v1_old, __addr, (__v16si)__index, __mask, __scale); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i32gather_epi32(__m512i __v1_old, __mmask16 __mask, - __m512i __index, void const *__addr, - int __scale) { +__funline __m512i _mm512_mask_i32gather_epi32(__m512i __v1_old, __mmask16 __mask, + __m512i __index, void const *__addr, + int __scale) { return (__m512i)__builtin_ia32_gathersiv16si( (__v16si)__v1_old, __addr, (__v16si)__index, __mask, __scale); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_i32gather_epi64(__m256i __index, void const *__addr, int __scale) { +__funline __m512i _mm512_i32gather_epi64(__m256i 
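/*
 * Illustrative aside: gather sketch loading base[idx[i]] for 16 float
 * lanes. The scale operand must be a literal 1, 2, 4, or 8. Hypothetical
 * helper, assuming <immintrin.h>:
 */
#include <immintrin.h>
static inline __m512 gather16f(const float *base, __m512i idx) {
  return _mm512_i32gather_ps(idx, base, 4); /* 4 = sizeof(float) */
}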
__index, void const *__addr, + int __scale) { __m512i __v1_old = _mm512_undefined_epi32(); __mmask8 __mask = 0xFF; @@ -8009,18 +6643,15 @@ extern __inline __m512i (__v8si)__index, __mask, __scale); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i32gather_epi64(__m512i __v1_old, __mmask8 __mask, - __m256i __index, void const *__addr, - int __scale) { +__funline __m512i _mm512_mask_i32gather_epi64(__m512i __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) { return (__m512i)__builtin_ia32_gathersiv8di((__v8di)__v1_old, __addr, (__v8si)__index, __mask, __scale); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_i64gather_epi32(__m512i __index, void const *__addr, int __scale) { +__funline __m256i _mm512_i64gather_epi32(__m512i __index, void const *__addr, + int __scale) { __m256i __v1_old = _mm256_undefined_si256(); __mmask8 __mask = 0xFF; @@ -8028,18 +6659,15 @@ extern __inline __m256i (__v8si)__v1_old, __addr, (__v8di)__index, __mask, __scale); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i64gather_epi32(__m256i __v1_old, __mmask8 __mask, - __m512i __index, void const *__addr, - int __scale) { +__funline __m256i _mm512_mask_i64gather_epi32(__m256i __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, + int __scale) { return (__m256i)__builtin_ia32_gatherdiv16si( (__v8si)__v1_old, __addr, (__v8di)__index, __mask, __scale); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_i64gather_epi64(__m512i __index, void const *__addr, int __scale) { +__funline __m512i _mm512_i64gather_epi64(__m512i __index, void const *__addr, + int __scale) { __m512i __v1_old = _mm512_undefined_epi32(); __mmask8 __mask = 0xFF; @@ -8047,135 +6675,113 @@ extern __inline __m512i (__v8di)__index, __mask, __scale); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i64gather_epi64(__m512i __v1_old, __mmask8 __mask, - __m512i __index, void const *__addr, - int __scale) { +__funline __m512i _mm512_mask_i64gather_epi64(__m512i __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, + int __scale) { return (__m512i)__builtin_ia32_gatherdiv8di((__v8di)__v1_old, __addr, (__v8di)__index, __mask, __scale); } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_i32scatter_ps(void *__addr, __m512i __index, __m512 __v1, int __scale) { +__funline void _mm512_i32scatter_ps(void *__addr, __m512i __index, __m512 __v1, + int __scale) { __builtin_ia32_scattersiv16sf(__addr, (__mmask16)0xFFFF, (__v16si)__index, (__v16sf)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i32scatter_ps(void *__addr, __mmask16 __mask, __m512i __index, - __m512 __v1, int __scale) { +__funline void _mm512_mask_i32scatter_ps(void *__addr, __mmask16 __mask, + __m512i __index, __m512 __v1, + int __scale) { __builtin_ia32_scattersiv16sf(__addr, __mask, (__v16si)__index, (__v16sf)__v1, __scale); } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_i32scatter_pd(void *__addr, __m256i __index, __m512d __v1, int __scale) { +__funline void _mm512_i32scatter_pd(void *__addr, __m256i __index, __m512d __v1, + int __scale) { __builtin_ia32_scattersiv8df(__addr, 
(__mmask8)0xFF, (__v8si)__index, (__v8df)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i32scatter_pd(void *__addr, __mmask8 __mask, __m256i __index, - __m512d __v1, int __scale) { +__funline void _mm512_mask_i32scatter_pd(void *__addr, __mmask8 __mask, + __m256i __index, __m512d __v1, + int __scale) { __builtin_ia32_scattersiv8df(__addr, __mask, (__v8si)__index, (__v8df)__v1, __scale); } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_i64scatter_ps(void *__addr, __m512i __index, __m256 __v1, int __scale) { +__funline void _mm512_i64scatter_ps(void *__addr, __m512i __index, __m256 __v1, + int __scale) { __builtin_ia32_scatterdiv16sf(__addr, (__mmask8)0xFF, (__v8di)__index, (__v8sf)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i64scatter_ps(void *__addr, __mmask8 __mask, __m512i __index, - __m256 __v1, int __scale) { +__funline void _mm512_mask_i64scatter_ps(void *__addr, __mmask8 __mask, + __m512i __index, __m256 __v1, + int __scale) { __builtin_ia32_scatterdiv16sf(__addr, __mask, (__v8di)__index, (__v8sf)__v1, __scale); } -extern __inline void __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_i64scatter_pd(void *__addr, __m512i __index, __m512d __v1, int __scale) { +__funline void _mm512_i64scatter_pd(void *__addr, __m512i __index, __m512d __v1, + int __scale) { __builtin_ia32_scatterdiv8df(__addr, (__mmask8)0xFF, (__v8di)__index, (__v8df)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i64scatter_pd(void *__addr, __mmask8 __mask, __m512i __index, - __m512d __v1, int __scale) { +__funline void _mm512_mask_i64scatter_pd(void *__addr, __mmask8 __mask, + __m512i __index, __m512d __v1, + int __scale) { __builtin_ia32_scatterdiv8df(__addr, __mask, (__v8di)__index, (__v8df)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_i32scatter_epi32(void *__addr, __m512i __index, __m512i __v1, - int __scale) { +__funline void _mm512_i32scatter_epi32(void *__addr, __m512i __index, + __m512i __v1, int __scale) { __builtin_ia32_scattersiv16si(__addr, (__mmask16)0xFFFF, (__v16si)__index, (__v16si)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i32scatter_epi32(void *__addr, __mmask16 __mask, - __m512i __index, __m512i __v1, int __scale) { +__funline void _mm512_mask_i32scatter_epi32(void *__addr, __mmask16 __mask, + __m512i __index, __m512i __v1, + int __scale) { __builtin_ia32_scattersiv16si(__addr, __mask, (__v16si)__index, (__v16si)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_i32scatter_epi64(void *__addr, __m256i __index, __m512i __v1, - int __scale) { +__funline void _mm512_i32scatter_epi64(void *__addr, __m256i __index, + __m512i __v1, int __scale) { __builtin_ia32_scattersiv8di(__addr, (__mmask8)0xFF, (__v8si)__index, (__v8di)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i32scatter_epi64(void *__addr, __mmask8 __mask, __m256i __index, - __m512i __v1, int __scale) { +__funline void _mm512_mask_i32scatter_epi64(void *__addr, __mmask8 __mask, + __m256i __index, __m512i __v1, + int __scale) { 
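/*
 * Illustrative aside: masked scatter sketch; only lanes selected by the
 * mask touch memory, each at base + 4 * idx[i]. Hypothetical helper,
 * assuming <immintrin.h>:
 */
#include <immintrin.h>
static inline void scatter16f(float *base, __mmask16 m, __m512i idx, __m512 v) {
  _mm512_mask_i32scatter_ps(base, m, idx, v, 4); /* masked-off lanes skipped */
}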
__builtin_ia32_scattersiv8di(__addr, __mask, (__v8si)__index, (__v8di)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_i64scatter_epi32(void *__addr, __m512i __index, __m256i __v1, - int __scale) { +__funline void _mm512_i64scatter_epi32(void *__addr, __m512i __index, + __m256i __v1, int __scale) { __builtin_ia32_scatterdiv16si(__addr, (__mmask8)0xFF, (__v8di)__index, (__v8si)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, __m512i __index, - __m256i __v1, int __scale) { +__funline void _mm512_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, + __m512i __index, __m256i __v1, + int __scale) { __builtin_ia32_scatterdiv16si(__addr, __mask, (__v8di)__index, (__v8si)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_i64scatter_epi64(void *__addr, __m512i __index, __m512i __v1, - int __scale) { +__funline void _mm512_i64scatter_epi64(void *__addr, __m512i __index, + __m512i __v1, int __scale) { __builtin_ia32_scatterdiv8di(__addr, (__mmask8)0xFF, (__v8di)__index, (__v8di)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_i64scatter_epi64(void *__addr, __mmask8 __mask, __m512i __index, - __m512i __v1, int __scale) { +__funline void _mm512_mask_i64scatter_epi64(void *__addr, __mmask8 __mask, + __m512i __index, __m512i __v1, + int __scale) { __builtin_ia32_scatterdiv8di(__addr, __mask, (__v8di)__index, (__v8di)__v1, __scale); } @@ -8341,198 +6947,155 @@ extern __inline void (int)SCALE) #endif -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_compress_pd(__m512d __W, __mmask8 __U, __m512d __A) { +__funline __m512d _mm512_mask_compress_pd(__m512d __W, __mmask8 __U, + __m512d __A) { return (__m512d)__builtin_ia32_compressdf512_mask((__v8df)__A, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_compress_pd(__mmask8 __U, __m512d __A) { +__funline __m512d _mm512_maskz_compress_pd(__mmask8 __U, __m512d __A) { return (__m512d)__builtin_ia32_compressdf512_mask( (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m512d __A) { +__funline void _mm512_mask_compressstoreu_pd(void *__P, __mmask8 __U, + __m512d __A) { __builtin_ia32_compressstoredf512_mask((__v8df *)__P, (__v8df)__A, (__mmask8)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_compress_ps(__m512 __W, __mmask16 __U, __m512 __A) { +__funline __m512 _mm512_mask_compress_ps(__m512 __W, __mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_compresssf512_mask((__v16sf)__A, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_compress_ps(__mmask16 __U, __m512 __A) { +__funline __m512 _mm512_maskz_compress_ps(__mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_compresssf512_mask( (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_compressstoreu_ps(void 
*__P, __mmask16 __U, __m512 __A) { +__funline void _mm512_mask_compressstoreu_ps(void *__P, __mmask16 __U, + __m512 __A) { __builtin_ia32_compressstoresf512_mask((__v16sf *)__P, (__v16sf)__A, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_compress_epi64(__m512i __W, __mmask8 __U, __m512i __A) { +__funline __m512i _mm512_mask_compress_epi64(__m512i __W, __mmask8 __U, + __m512i __A) { return (__m512i)__builtin_ia32_compressdi512_mask((__v8di)__A, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_compress_epi64(__mmask8 __U, __m512i __A) { +__funline __m512i _mm512_maskz_compress_epi64(__mmask8 __U, __m512i __A) { return (__m512i)__builtin_ia32_compressdi512_mask( (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_compressstoreu_epi64(void *__P, __mmask8 __U, __m512i __A) { +__funline void _mm512_mask_compressstoreu_epi64(void *__P, __mmask8 __U, + __m512i __A) { __builtin_ia32_compressstoredi512_mask((__v8di *)__P, (__v8di)__A, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_compress_epi32(__m512i __W, __mmask16 __U, __m512i __A) { +__funline __m512i _mm512_mask_compress_epi32(__m512i __W, __mmask16 __U, + __m512i __A) { return (__m512i)__builtin_ia32_compresssi512_mask((__v16si)__A, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_compress_epi32(__mmask16 __U, __m512i __A) { +__funline __m512i _mm512_maskz_compress_epi32(__mmask16 __U, __m512i __A) { return (__m512i)__builtin_ia32_compresssi512_mask( (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_compressstoreu_epi32(void *__P, __mmask16 __U, __m512i __A) { +__funline void _mm512_mask_compressstoreu_epi32(void *__P, __mmask16 __U, + __m512i __A) { __builtin_ia32_compressstoresi512_mask((__v16si *)__P, (__v16si)__A, (__mmask16)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_expand_pd(__m512d __W, __mmask8 __U, __m512d __A) { +__funline __m512d _mm512_mask_expand_pd(__m512d __W, __mmask8 __U, __m512d __A) { return (__m512d)__builtin_ia32_expanddf512_mask((__v8df)__A, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_expand_pd(__mmask8 __U, __m512d __A) { +__funline __m512d _mm512_maskz_expand_pd(__mmask8 __U, __m512d __A) { return (__m512d)__builtin_ia32_expanddf512_maskz( (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P) { +__funline __m512d _mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, + void const *__P) { return (__m512d)__builtin_ia32_expandloaddf512_mask( (const __v8df *)__P, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P) { +__funline __m512d _mm512_maskz_expandloadu_pd(__mmask8 __U, void const 
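/*
 * Illustrative aside: compress-store sketch; the selected lanes are
 * packed contiguously at the destination, and a popcount of the mask
 * tells how many were written. Hypothetical helper:
 */
#include <immintrin.h>
static inline int compress16f(float *dst, __mmask16 m, __m512 v) {
  _mm512_mask_compressstoreu_ps(dst, m, v); /* unaligned store is fine */
  return __builtin_popcount((unsigned)m);   /* number of lanes written */
}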
*__P) { return (__m512d)__builtin_ia32_expandloaddf512_maskz( (const __v8df *)__P, (__v8df)_mm512_setzero_pd(), (__mmask8)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_expand_ps(__m512 __W, __mmask16 __U, __m512 __A) { +__funline __m512 _mm512_mask_expand_ps(__m512 __W, __mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_expandsf512_mask((__v16sf)__A, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_expand_ps(__mmask16 __U, __m512 __A) { +__funline __m512 _mm512_maskz_expand_ps(__mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_expandsf512_maskz( (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P) { +__funline __m512 _mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, + void const *__P) { return (__m512)__builtin_ia32_expandloadsf512_mask( (const __v16sf *)__P, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P) { +__funline __m512 _mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P) { return (__m512)__builtin_ia32_expandloadsf512_maskz( (const __v16sf *)__P, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_expand_epi64(__m512i __W, __mmask8 __U, __m512i __A) { +__funline __m512i _mm512_mask_expand_epi64(__m512i __W, __mmask8 __U, + __m512i __A) { return (__m512i)__builtin_ia32_expanddi512_mask((__v8di)__A, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_expand_epi64(__mmask8 __U, __m512i __A) { +__funline __m512i _mm512_maskz_expand_epi64(__mmask8 __U, __m512i __A) { return (__m512i)__builtin_ia32_expanddi512_maskz( (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P) { +__funline __m512i _mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, + void const *__P) { return (__m512i)__builtin_ia32_expandloaddi512_mask( (const __v8di *)__P, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) { +__funline __m512i _mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) { return (__m512i)__builtin_ia32_expandloaddi512_maskz( (const __v8di *)__P, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_expand_epi32(__m512i __W, __mmask16 __U, __m512i __A) { +__funline __m512i _mm512_mask_expand_epi32(__m512i __W, __mmask16 __U, + __m512i __A) { return (__m512i)__builtin_ia32_expandsi512_mask((__v16si)__A, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_expand_epi32(__mmask16 __U, __m512i __A) { +__funline __m512i _mm512_maskz_expand_epi32(__mmask16 __U, __m512i __A) { return 
(__m512i)__builtin_ia32_expandsi512_maskz( (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P) { +__funline __m512i _mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, + void const *__P) { return (__m512i)__builtin_ia32_expandloadsi512_mask( (const __v16si *)__P, (__v16si)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P) { +__funline __m512i _mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P) { return (__m512i)__builtin_ia32_expandloadsi512_maskz( (const __v16si *)__P, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } @@ -8545,140 +7108,99 @@ extern __inline __m512i #define _kxnor_mask16 _mm512_kxnor #define _kxor_mask16 _mm512_kxor -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__CF) { +__funline unsigned char _kortest_mask16_u8(__mmask16 __A, __mmask16 __B, + unsigned char *__CF) { *__CF = (unsigned char)__builtin_ia32_kortestchi(__A, __B); return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kortestz_mask16_u8(__mmask16 __A, __mmask16 __B) { +__funline unsigned char _kortestz_mask16_u8(__mmask16 __A, __mmask16 __B) { return (unsigned char)__builtin_ia32_kortestzhi((__mmask16)__A, (__mmask16)__B); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kortestc_mask16_u8(__mmask16 __A, __mmask16 __B) { +__funline unsigned char _kortestc_mask16_u8(__mmask16 __A, __mmask16 __B) { return (unsigned char)__builtin_ia32_kortestchi((__mmask16)__A, (__mmask16)__B); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _cvtmask16_u32(__mmask16 __A) { +__funline unsigned int _cvtmask16_u32(__mmask16 __A) { return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _cvtu32_mask16(unsigned int __A) { +__funline __mmask16 _cvtu32_mask16(unsigned int __A) { return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _load_mask16(__mmask16 *__A) { +__funline __mmask16 _load_mask16(__mmask16 *__A) { return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _store_mask16(__mmask16 *__A, __mmask16 __B) { +__funline void _store_mask16(__mmask16 *__A, __mmask16 __B) { *(__mmask16 *)__A = __builtin_ia32_kmovw(__B); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_kand(__mmask16 __A, __mmask16 __B) { +__funline __mmask16 _mm512_kand(__mmask16 __A, __mmask16 __B) { return (__mmask16)__builtin_ia32_kandhi((__mmask16)__A, (__mmask16)__B); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_kandn(__mmask16 __A, __mmask16 __B) { +__funline __mmask16 _mm512_kandn(__mmask16 __A, __mmask16 __B) { return (__mmask16)__builtin_ia32_kandnhi((__mmask16)__A, 
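/*
 * Illustrative aside: the k-register helpers above keep mask logic out
 * of general-purpose registers. Hypothetical sketch:
 */
#include <immintrin.h>
static inline int any_common_lane(__mmask16 a, __mmask16 b) {
  __mmask16 both = _mm512_kand(a, b);
  return !_mm512_kortestz(both, both); /* nonzero iff a and b share a lane */
}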
(__mmask16)__B); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_kor(__mmask16 __A, __mmask16 __B) { +__funline __mmask16 _mm512_kor(__mmask16 __A, __mmask16 __B) { return (__mmask16)__builtin_ia32_korhi((__mmask16)__A, (__mmask16)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_kortestz(__mmask16 __A, __mmask16 __B) { +__funline int _mm512_kortestz(__mmask16 __A, __mmask16 __B) { return (__mmask16)__builtin_ia32_kortestzhi((__mmask16)__A, (__mmask16)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_kortestc(__mmask16 __A, __mmask16 __B) { +__funline int _mm512_kortestc(__mmask16 __A, __mmask16 __B) { return (__mmask16)__builtin_ia32_kortestchi((__mmask16)__A, (__mmask16)__B); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_kxnor(__mmask16 __A, __mmask16 __B) { +__funline __mmask16 _mm512_kxnor(__mmask16 __A, __mmask16 __B) { return (__mmask16)__builtin_ia32_kxnorhi((__mmask16)__A, (__mmask16)__B); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_kxor(__mmask16 __A, __mmask16 __B) { +__funline __mmask16 _mm512_kxor(__mmask16 __A, __mmask16 __B) { return (__mmask16)__builtin_ia32_kxorhi((__mmask16)__A, (__mmask16)__B); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_knot(__mmask16 __A) { +__funline __mmask16 _mm512_knot(__mmask16 __A) { return (__mmask16)__builtin_ia32_knothi((__mmask16)__A); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_kunpackb(__mmask16 __A, __mmask16 __B) { +__funline __mmask16 _mm512_kunpackb(__mmask16 __A, __mmask16 __B) { return (__mmask16)__builtin_ia32_kunpckhi((__mmask16)__A, (__mmask16)__B); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _kunpackb_mask16(__mmask8 __A, __mmask8 __B) { +__funline __mmask16 _kunpackb_mask16(__mmask8 __A, __mmask8 __B) { return (__mmask16)__builtin_ia32_kunpckhi((__mmask16)__A, (__mmask16)__B); } #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_inserti32x4(__mmask16 __B, __m512i __C, __m128i __D, - const int __imm) { +__funline __m512i _mm512_maskz_inserti32x4(__mmask16 __B, __m512i __C, + __m128i __D, const int __imm) { return (__m512i)__builtin_ia32_inserti32x4_mask( (__v16si)__C, (__v4si)__D, __imm, (__v16si)_mm512_setzero_si512(), __B); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_insertf32x4(__mmask16 __B, __m512 __C, __m128 __D, - const int __imm) { +__funline __m512 _mm512_maskz_insertf32x4(__mmask16 __B, __m512 __C, __m128 __D, + const int __imm) { return (__m512)__builtin_ia32_insertf32x4_mask( (__v16sf)__C, (__v4sf)__D, __imm, (__v16sf)_mm512_setzero_ps(), __B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_inserti32x4(__m512i __A, __mmask16 __B, __m512i __C, - __m128i __D, const int __imm) { +__funline __m512i _mm512_mask_inserti32x4(__m512i __A, __mmask16 __B, __m512i __C, + __m128i __D, const int __imm) { return (__m512i)__builtin_ia32_inserti32x4_mask((__v16si)__C, (__v4si)__D, __imm, (__v16si)__A, __B); } -extern __inline __m512 - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_insertf32x4(__m512 __A, __mmask16 __B, __m512 __C, __m128 __D, - const int __imm) { +__funline __m512 _mm512_mask_insertf32x4(__m512 __A, __mmask16 __B, __m512 __C, + __m128 __D, const int __imm) { return (__m512)__builtin_ia32_insertf32x4_mask((__v16sf)__C, (__v4sf)__D, __imm, (__v16sf)__A, __B); } @@ -8704,281 +7226,220 @@ extern __inline __m512 (__v16si)(__m512i)(A), (__mmask16)(B))) #endif -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_max_epi64(__m512i __A, __m512i __B) { +__funline __m512i _mm512_max_epi64(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxsq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_max_epi64(__mmask8 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_max_epi64(__mmask8 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxsq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_max_epi64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_max_epi64(__m512i __W, __mmask8 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmaxsq512_mask((__v8di)__A, (__v8di)__B, (__v8di)__W, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_min_epi64(__m512i __A, __m512i __B) { +__funline __m512i _mm512_min_epi64(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminsq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_min_epi64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_min_epi64(__m512i __W, __mmask8 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pminsq512_mask((__v8di)__A, (__v8di)__B, (__v8di)__W, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_min_epi64(__mmask8 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_min_epi64(__mmask8 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminsq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_max_epu64(__m512i __A, __m512i __B) { +__funline __m512i _mm512_max_epu64(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxuq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_max_epu64(__mmask8 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_max_epu64(__mmask8 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxuq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_max_epu64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_max_epu64(__m512i __W, __mmask8 __M, __m512i __A, + __m512i __B) { return 
(__m512i)__builtin_ia32_pmaxuq512_mask((__v8di)__A, (__v8di)__B, (__v8di)__W, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_min_epu64(__m512i __A, __m512i __B) { +__funline __m512i _mm512_min_epu64(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminuq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_min_epu64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_min_epu64(__m512i __W, __mmask8 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pminuq512_mask((__v8di)__A, (__v8di)__B, (__v8di)__W, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_min_epu64(__mmask8 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_min_epu64(__mmask8 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminuq512_mask( (__v8di)__A, (__v8di)__B, (__v8di)_mm512_setzero_si512(), __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_max_epi32(__m512i __A, __m512i __B) { +__funline __m512i _mm512_max_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxsd512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_max_epi32(__mmask16 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_max_epi32(__mmask16 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmaxsd512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), __M); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_max_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_max_epi32(__m512i __W, __mmask16 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmaxsd512_mask((__v16si)__A, (__v16si)__B, (__v16si)__W, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_min_epi32(__m512i __A, __m512i __B) { +__funline __m512i _mm512_min_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminsd512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_min_epi32(__mmask16 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_min_epi32(__mmask16 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pminsd512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), __M); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_min_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_min_epi32(__m512i __W, __mmask16 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pminsd512_mask((__v16si)__A, (__v16si)__B, (__v16si)__W, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_max_epu32(__m512i __A, __m512i __B) { +__funline __m512i _mm512_max_epu32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmaxud512_mask( (__v16si)__A, (__v16si)__B, 
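/*
 * Illustrative aside: the packed min/max forms compose into a
 * branch-free clamp. Hypothetical helper, assuming <immintrin.h>:
 */
#include <immintrin.h>
static inline __m512i clamp_epi32(__m512i v, __m512i lo, __m512i hi) {
  return _mm512_min_epi32(_mm512_max_epi32(v, lo), hi); /* lo <= r <= hi */
}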
(__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_max_epu32(__mmask16 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_max_epu32(__mmask16 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmaxud512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), __M); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_max_epu32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_max_epu32(__m512i __W, __mmask16 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pmaxud512_mask((__v16si)__A, (__v16si)__B, (__v16si)__W, __M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_min_epu32(__m512i __A, __m512i __B) { +__funline __m512i _mm512_min_epu32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pminud512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_min_epu32(__mmask16 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_min_epu32(__mmask16 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pminud512_mask( (__v16si)__A, (__v16si)__B, (__v16si)_mm512_setzero_si512(), __M); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_min_epu32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_mask_min_epu32(__m512i __W, __mmask16 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_pminud512_mask((__v16si)__A, (__v16si)__B, (__v16si)__W, __M); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_unpacklo_ps(__m512 __A, __m512 __B) { +__funline __m512 _mm512_unpacklo_ps(__m512 __A, __m512 __B) { return (__m512)__builtin_ia32_unpcklps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B) { return (__m512)__builtin_ia32_unpcklps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_unpacklo_ps(__mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_maskz_unpacklo_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_unpcklps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U); } #ifdef __OPTIMIZE__ -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_max_round_sd(__m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_max_round_sd(__m128d __A, __m128d __B, const int __R) { return (__m128d)__builtin_ia32_maxsd_round((__v2df)__A, (__v2df)__B, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_max_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, - const int __R) { +__funline __m128d _mm_mask_max_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int 
__R) { return (__m128d)__builtin_ia32_maxsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_max_round_sd(__mmask8 __U, __m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_maskz_max_round_sd(__mmask8 __U, __m128d __A, __m128d __B, + const int __R) { return (__m128d)__builtin_ia32_maxsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_max_round_ss(__m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_max_round_ss(__m128 __A, __m128 __B, const int __R) { return (__m128)__builtin_ia32_maxss_round((__v4sf)__A, (__v4sf)__B, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_max_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_mask_max_round_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_maxss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_max_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_maskz_max_round_ss(__mmask8 __U, __m128 __A, __m128 __B, + const int __R) { return (__m128)__builtin_ia32_maxss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_min_round_sd(__m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_min_round_sd(__m128d __A, __m128d __B, const int __R) { return (__m128d)__builtin_ia32_minsd_round((__v2df)__A, (__v2df)__B, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_min_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, - const int __R) { +__funline __m128d _mm_mask_min_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { return (__m128d)__builtin_ia32_minsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, __R); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_min_round_sd(__mmask8 __U, __m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_maskz_min_round_sd(__mmask8 __U, __m128d __A, __m128d __B, + const int __R) { return (__m128d)__builtin_ia32_minsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_min_round_ss(__m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_min_round_ss(__m128 __A, __m128 __B, const int __R) { return (__m128)__builtin_ia32_minss_round((__v4sf)__A, (__v4sf)__B, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_min_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_mask_min_round_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_minss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, __R); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) 
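/*
 * Illustrative aside: the *_round_* scalar forms take an explicit
 * suppress-all-exceptions immediate (max/min never round, so only the
 * SAE bit matters). Requires __OPTIMIZE__, since the immediate must be
 * a constant. Hypothetical helper:
 */
#include <immintrin.h>
static inline __m128d max_sd_sae(__m128d a, __m128d b) {
  return _mm_max_round_sd(a, b, _MM_FROUND_NO_EXC); /* raises no FP flags */
}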
-_mm_maskz_min_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_maskz_min_round_ss(__mmask8 __U, __m128 __A, __m128 __B, + const int __R) { return (__m128)__builtin_ia32_minss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, __R); } @@ -9020,87 +7481,73 @@ _mm_maskz_min_round_ss(__mmask8 __U, __m128 __A, __m128 __B, const int __R) { #endif -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W) { +__funline __m512d _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W) { return (__m512d)__builtin_ia32_blendmpd_512_mask((__v8df)__A, (__v8df)__W, (__mmask8)__U); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W) { +__funline __m512 _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W) { return (__m512)__builtin_ia32_blendmps_512_mask((__v16sf)__A, (__v16sf)__W, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W) { +__funline __m512i _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, + __m512i __W) { return (__m512i)__builtin_ia32_blendmq_512_mask((__v8di)__A, (__v8di)__W, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) { +__funline __m512i _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, + __m512i __W) { return (__m512i)__builtin_ia32_blendmd_512_mask((__v16si)__A, (__v16si)__W, (__mmask16)__U); } #ifdef __OPTIMIZE__ -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fmadd_round_sd(__m128d __W, __m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_fmadd_round_sd(__m128d __W, __m128d __A, __m128d __B, + const int __R) { return (__m128d)__builtin_ia32_vfmaddsd3_round((__v2df)__W, (__v2df)__A, (__v2df)__B, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fmadd_round_ss(__m128 __W, __m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_fmadd_round_ss(__m128 __W, __m128 __A, __m128 __B, + const int __R) { return (__m128)__builtin_ia32_vfmaddss3_round((__v4sf)__W, (__v4sf)__A, (__v4sf)__B, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fmsub_round_sd(__m128d __W, __m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_fmsub_round_sd(__m128d __W, __m128d __A, __m128d __B, + const int __R) { return (__m128d)__builtin_ia32_vfmaddsd3_round((__v2df)__W, (__v2df)__A, -(__v2df)__B, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fmsub_round_ss(__m128 __W, __m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_fmsub_round_ss(__m128 __W, __m128 __A, __m128 __B, + const int __R) { return (__m128)__builtin_ia32_vfmaddss3_round((__v4sf)__W, (__v4sf)__A, -(__v4sf)__B, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fnmadd_round_sd(__m128d __W, __m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_fnmadd_round_sd(__m128d __W, __m128d __A, __m128d __B, + const int __R) { return (__m128d)__builtin_ia32_vfmaddsd3_round((__v2df)__W, -(__v2df)__A, 
(__v2df)__B, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fnmadd_round_ss(__m128 __W, __m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_fnmadd_round_ss(__m128 __W, __m128 __A, __m128 __B, + const int __R) { return (__m128)__builtin_ia32_vfmaddss3_round((__v4sf)__W, -(__v4sf)__A, (__v4sf)__B, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fnmsub_round_sd(__m128d __W, __m128d __A, __m128d __B, const int __R) { +__funline __m128d _mm_fnmsub_round_sd(__m128d __W, __m128d __A, __m128d __B, + const int __R) { return (__m128d)__builtin_ia32_vfmaddsd3_round((__v2df)__W, -(__v2df)__A, -(__v2df)__B, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fnmsub_round_ss(__m128 __W, __m128 __A, __m128 __B, const int __R) { +__funline __m128 _mm_fnmsub_round_ss(__m128 __W, __m128 __A, __m128 __B, + const int __R) { return (__m128)__builtin_ia32_vfmaddss3_round((__v4sf)__W, -(__v4sf)__A, -(__v4sf)__B, __R); } @@ -9130,387 +7577,317 @@ extern __inline __m128 (__m128) __builtin_ia32_vfmaddss3_round(A, -(B), -(C), R) #endif -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_fmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_fmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmadd_sd(__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) { +__funline __m128d _mm_mask3_fmadd_sd(__m128d __W, __m128d __A, __m128d __B, + __mmask8 __U) { return (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmadd_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) { +__funline __m128 _mm_mask3_fmadd_ss(__m128 __W, __m128 __A, __m128 __B, + __mmask8 __U) { return (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)__W, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmadd_sd(__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_fmadd_sd(__mmask8 __U, __m128d __W, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmadd_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_fmadd_ss(__mmask8 __U, __m128 __W, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)__W, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } 
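
The masked scalar FMA intrinsics above keep their semantics across this rewrite: each removed extern __inline ... __attribute__((__gnu_inline__, __always_inline__, __artificial__)) declarator is replaced by the single __funline token and nothing else changes. For the mask/maskz forms, the mask bit selects between the fused result and a fallback value: the first source operand __W for the _mm_mask_* forms, zero for the _mm_maskz_* forms. A minimal sketch of that behavior, assuming an AVX-512F toolchain (compile with -mavx512f and run on AVX-512 hardware); main(), the printf call, and the chosen inputs are illustrative and not part of the header:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128d w = _mm_set_sd(1.0);   /* element 0 = 1.0, upper element = 0.0 */
  __m128d a = _mm_set_sd(2.0);
  __m128d b = _mm_set_sd(10.0);
  __m128d r1 = _mm_mask_fmadd_sd(w, 1, a, b);  /* bit 0 set:   1*2+10 = 12 */
  __m128d r2 = _mm_mask_fmadd_sd(w, 0, a, b);  /* bit 0 clear: w[0] = 1 passes through */
  __m128d r3 = _mm_maskz_fmadd_sd(0, w, a, b); /* bit 0 clear: element is zeroed */
  printf("%g %g %g\n", _mm_cvtsd_f64(r1), _mm_cvtsd_f64(r2),
         _mm_cvtsd_f64(r3));                   /* prints: 12 1 0 */
  return 0;
}

The _round variants in the hunks that follow compute the same thing but take an explicit rounding-mode operand __R (for example _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) in place of the implicit _MM_FROUND_CUR_DIRECTION used by the forms above.
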
-extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_fmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W, (__v2df)__A, -(__v2df)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_fmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, (__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmsub_sd(__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) { +__funline __m128d _mm_mask3_fmsub_sd(__m128d __W, __m128d __A, __m128d __B, + __mmask8 __U) { return (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmsub_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) { +__funline __m128 _mm_mask3_fmsub_ss(__m128 __W, __m128 __A, __m128 __B, + __mmask8 __U) { return (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)__W, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmsub_sd(__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_fmsub_sd(__mmask8 __U, __m128d __W, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)__W, (__v2df)__A, -(__v2df)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmsub_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_fmsub_ss(__mmask8 __U, __m128 __W, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)__W, (__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fnmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_fnmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fnmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_fnmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, -(__v4sf)__A, (__v4sf)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fnmadd_sd(__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) { +__funline __m128d _mm_mask3_fnmadd_sd(__m128d __W, __m128d __A, __m128d __B, + __mmask8 __U) { return (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fnmadd_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) { +__funline __m128 _mm_mask3_fnmadd_ss(__m128 __W, __m128 __A, __m128 __B, + __mmask8 __U) { return (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)__W, -(__v4sf)__A, (__v4sf)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fnmadd_sd(__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_fnmadd_sd(__mmask8 __U, __m128d __W, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fnmadd_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_fnmadd_ss(__mmask8 __U, __m128 __W, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)__W, -(__v4sf)__A, (__v4sf)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fnmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_fnmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W, -(__v2df)__A, -(__v2df)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fnmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_fnmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, -(__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fnmsub_sd(__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) { +__funline __m128d _mm_mask3_fnmsub_sd(__m128d __W, __m128d __A, __m128d __B, + __mmask8 __U) { return (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fnmsub_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) { +__funline __m128 _mm_mask3_fnmsub_ss(__m128 __W, __m128 __A, __m128 __B, + __mmask8 __U) { return (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)__W, -(__v4sf)__A, (__v4sf)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fnmsub_sd(__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_fnmsub_sd(__mmask8 __U, __m128d __W, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)__W, -(__v2df)__A, -(__v2df)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fnmsub_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_fnmsub_ss(__mmask8 __U, __m128 __W, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)__W, -(__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } #ifdef __OPTIMIZE__ -extern __inline __m128d 
- __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, - const int __R) { +__funline __m128d _mm_mask_fmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { return (__m128d)__builtin_ia32_vfmaddsd3_mask( (__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_mask_fmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmadd_round_sd(__m128d __W, __m128d __A, __m128d __B, - __mmask8 __U, const int __R) { +__funline __m128d _mm_mask3_fmadd_round_sd(__m128d __W, __m128d __A, __m128d __B, + __mmask8 __U, const int __R) { return (__m128d)__builtin_ia32_vfmaddsd3_mask3( (__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmadd_round_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U, - const int __R) { +__funline __m128 _mm_mask3_fmadd_round_ss(__m128 __W, __m128 __A, __m128 __B, + __mmask8 __U, const int __R) { return (__m128)__builtin_ia32_vfmaddss3_mask3( (__v4sf)__W, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmadd_round_sd(__mmask8 __U, __m128d __W, __m128d __A, - __m128d __B, const int __R) { +__funline __m128d _mm_maskz_fmadd_round_sd(__mmask8 __U, __m128d __W, __m128d __A, + __m128d __B, const int __R) { return (__m128d)__builtin_ia32_vfmaddsd3_maskz( (__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmadd_round_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_maskz_fmadd_round_ss(__mmask8 __U, __m128 __W, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_vfmaddss3_maskz( (__v4sf)__W, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, - const int __R) { +__funline __m128d _mm_mask_fmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { return (__m128d)__builtin_ia32_vfmaddsd3_mask( (__v2df)__W, (__v2df)__A, -(__v2df)__B, (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_mask_fmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_vfmaddss3_mask( (__v4sf)__W, (__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmsub_round_sd(__m128d __W, __m128d __A, __m128d __B, - __mmask8 __U, const int __R) { +__funline __m128d _mm_mask3_fmsub_round_sd(__m128d __W, __m128d __A, __m128d 
__B, + __mmask8 __U, const int __R) { return (__m128d)__builtin_ia32_vfmsubsd3_mask3( (__v2df)__W, (__v2df)__A, (__v2df)__B, (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmsub_round_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U, - const int __R) { +__funline __m128 _mm_mask3_fmsub_round_ss(__m128 __W, __m128 __A, __m128 __B, + __mmask8 __U, const int __R) { return (__m128)__builtin_ia32_vfmsubss3_mask3( (__v4sf)__W, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmsub_round_sd(__mmask8 __U, __m128d __W, __m128d __A, - __m128d __B, const int __R) { +__funline __m128d _mm_maskz_fmsub_round_sd(__mmask8 __U, __m128d __W, __m128d __A, + __m128d __B, const int __R) { return (__m128d)__builtin_ia32_vfmaddsd3_maskz( (__v2df)__W, (__v2df)__A, -(__v2df)__B, (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmsub_round_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_maskz_fmsub_round_ss(__mmask8 __U, __m128 __W, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_vfmaddss3_maskz( (__v4sf)__W, (__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fnmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, const int __R) { +__funline __m128d _mm_mask_fnmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { return (__m128d)__builtin_ia32_vfmaddsd3_mask( (__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fnmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_mask_fnmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W, -(__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fnmadd_round_sd(__m128d __W, __m128d __A, __m128d __B, - __mmask8 __U, const int __R) { +__funline __m128d _mm_mask3_fnmadd_round_sd(__m128d __W, __m128d __A, __m128d __B, + __mmask8 __U, const int __R) { return (__m128d)__builtin_ia32_vfmaddsd3_mask3( (__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fnmadd_round_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U, - const int __R) { +__funline __m128 _mm_mask3_fnmadd_round_ss(__m128 __W, __m128 __A, __m128 __B, + __mmask8 __U, const int __R) { return (__m128)__builtin_ia32_vfmaddss3_mask3( (__v4sf)__W, -(__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fnmadd_round_sd(__mmask8 __U, __m128d __W, __m128d __A, - __m128d __B, const int __R) { +__funline __m128d _mm_maskz_fnmadd_round_sd(__mmask8 __U, __m128d __W, + __m128d __A, __m128d __B, + const int __R) { return (__m128d)__builtin_ia32_vfmaddsd3_maskz( (__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm_maskz_fnmadd_round_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_maskz_fnmadd_round_ss(__mmask8 __U, __m128 __W, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_vfmaddss3_maskz( (__v4sf)__W, -(__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fnmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B, const int __R) { +__funline __m128d _mm_mask_fnmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) { return (__m128d)__builtin_ia32_vfmaddsd3_mask( (__v2df)__W, -(__v2df)__A, -(__v2df)__B, (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fnmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_mask_fnmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_vfmaddss3_mask( (__v4sf)__W, -(__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fnmsub_round_sd(__m128d __W, __m128d __A, __m128d __B, - __mmask8 __U, const int __R) { +__funline __m128d _mm_mask3_fnmsub_round_sd(__m128d __W, __m128d __A, __m128d __B, + __mmask8 __U, const int __R) { return (__m128d)__builtin_ia32_vfmsubsd3_mask3( (__v2df)__W, -(__v2df)__A, (__v2df)__B, (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fnmsub_round_ss(__m128 __W, __m128 __A, __m128 __B, __mmask8 __U, - const int __R) { +__funline __m128 _mm_mask3_fnmsub_round_ss(__m128 __W, __m128 __A, __m128 __B, + __mmask8 __U, const int __R) { return (__m128)__builtin_ia32_vfmsubss3_mask3( (__v4sf)__W, -(__v4sf)__A, (__v4sf)__B, (__mmask8)__U, __R); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fnmsub_round_sd(__mmask8 __U, __m128d __W, __m128d __A, - __m128d __B, const int __R) { +__funline __m128d _mm_maskz_fnmsub_round_sd(__mmask8 __U, __m128d __W, + __m128d __A, __m128d __B, + const int __R) { return (__m128d)__builtin_ia32_vfmaddsd3_maskz( (__v2df)__W, -(__v2df)__A, -(__v2df)__B, (__mmask8)__U, __R); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fnmsub_round_ss(__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, - const int __R) { +__funline __m128 _mm_maskz_fnmsub_round_ss(__mmask8 __U, __m128 __W, __m128 __A, + __m128 __B, const int __R) { return (__m128)__builtin_ia32_vfmaddss3_maskz( (__v4sf)__W, -(__v4sf)__A, -(__v4sf)__B, (__mmask8)__U, __R); } @@ -9589,15 +7966,13 @@ extern __inline __m128 #endif #ifdef __OPTIMIZE__ -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comi_round_ss(__m128 __A, __m128 __B, const int __P, const int __R) { +__funline int _mm_comi_round_ss(__m128 __A, __m128 __B, const int __P, + const int __R) { return __builtin_ia32_vcomiss((__v4sf)__A, (__v4sf)__B, __P, __R); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comi_round_sd(__m128d __A, __m128d __B, const int __P, const int __R) { +__funline int _mm_comi_round_sd(__m128d __A, __m128d __B, const int __P, + const int __R) { return __builtin_ia32_vcomisd((__v2df)__A, 
(__v2df)__B, __P, __R); } #else @@ -9605,1329 +7980,1065 @@ extern __inline int #define _mm_comi_round_sd(A, B, C, D) __builtin_ia32_vcomisd(A, B, C, D) #endif -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sqrt_pd(__m512d __A) { +__funline __m512d _mm512_sqrt_pd(__m512d __A) { return (__m512d)__builtin_ia32_sqrtpd512_mask( (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_sqrt_pd(__m512d __W, __mmask8 __U, __m512d __A) { +__funline __m512d _mm512_mask_sqrt_pd(__m512d __W, __mmask8 __U, __m512d __A) { return (__m512d)__builtin_ia32_sqrtpd512_mask( (__v8df)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sqrt_pd(__mmask8 __U, __m512d __A) { +__funline __m512d _mm512_maskz_sqrt_pd(__mmask8 __U, __m512d __A) { return (__m512d)__builtin_ia32_sqrtpd512_mask( (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sqrt_ps(__m512 __A) { +__funline __m512 _mm512_sqrt_ps(__m512 __A) { return (__m512)__builtin_ia32_sqrtps512_mask( (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) { +__funline __m512 _mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_sqrtps512_mask( (__v16sf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sqrt_ps(__mmask16 __U, __m512 __A) { +__funline __m512 _mm512_maskz_sqrt_ps(__mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_sqrtps512_mask( (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_add_pd(__m512d __A, __m512d __B) { +__funline __m512d _mm512_add_pd(__m512d __A, __m512d __B) { return (__m512d)((__v8df)__A + (__v8df)__B); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B) { return (__m512d)__builtin_ia32_addpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_addpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_add_ps(__m512 __A, __m512 __B) { +__funline __m512 _mm512_add_ps(__m512 __A, __m512 __B) { return (__m512)((__v16sf)__A + (__v16sf)__B); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B) { return (__m512)__builtin_ia32_addps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_addps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_add_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_add_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_addsd_mask_round((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_addsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_addss_mask_round((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_addss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sub_pd(__m512d __A, __m512d __B) { +__funline __m512d _mm512_sub_pd(__m512d __A, __m512d __B) { return (__m512d)((__v8df)__A - (__v8df)__B); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B) { return (__m512d)__builtin_ia32_subpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_subpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_sub_ps(__m512 __A, __m512 __B) { +__funline __m512 _mm512_sub_ps(__m512 __A, __m512 __B) { return (__m512)((__v16sf)__A - (__v16sf)__B); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
- _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B) { return (__m512)__builtin_ia32_subps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_subps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_subsd_mask_round((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_subsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_subss_mask_round((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_subss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mul_pd(__m512d __A, __m512d __B) { +__funline __m512d _mm512_mul_pd(__m512d __A, __m512d __B) { return (__m512d)((__v8df)__A * (__v8df)__B); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B) { return (__m512d)__builtin_ia32_mulpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_mulpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mul_ps(__m512 __A, __m512 __B) { +__funline __m512 _mm512_mul_ps(__m512 __A, __m512 __B) { return (__m512)((__v16sf)__A * (__v16sf)__B); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B) { return (__m512)__builtin_ia32_mulps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_mulps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_mulsd_mask_round((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_mulsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_mulss_mask_round((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_mulss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_div_pd(__m512d __M, __m512d __V) { +__funline __m512d _mm512_div_pd(__m512d __M, __m512d __V) { return (__m512d)((__v8df)__M / (__v8df)__V); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __M, __m512d __V) { +__funline __m512d _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __M, + __m512d __V) { return (__m512d)__builtin_ia32_divpd512_mask((__v8df)__M, (__v8df)__V, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_div_pd(__mmask8 __U, __m512d __M, __m512d __V) { +__funline __m512d _mm512_maskz_div_pd(__mmask8 __U, __m512d __M, __m512d __V) { return (__m512d)__builtin_ia32_divpd512_mask( (__v8df)__M, (__v8df)__V, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_div_ps(__m512 __A, __m512 __B) { +__funline __m512 _mm512_div_ps(__m512 __A, __m512 __B) { return (__m512)((__v16sf)__A / (__v16sf)__B); } -extern __inline __m512 - __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) - _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B) { return (__m512)__builtin_ia32_divps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_divps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_divsd_mask_round((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_divsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_divss_mask_round((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_divss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_max_pd(__m512d __A, __m512d __B) { +__funline __m512d _mm512_max_pd(__m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_maxpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_max_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_mask_max_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B) { return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_max_pd(__mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_maskz_max_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_maxpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_max_ps(__m512 __A, __m512 __B) { +__funline __m512 _mm512_max_ps(__m512 __A, __m512 __B) { 
return (__m512)__builtin_ia32_maxps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_max_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_mask_max_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B) { return (__m512)__builtin_ia32_maxps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_max_ps(__mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_maskz_max_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_maxps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_max_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_max_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_maxsd_mask_round((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_max_sd(__mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_max_sd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_maxsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_max_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_max_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_maxss_mask_round((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_max_ss(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_max_ss(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_maxss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_min_pd(__m512d __A, __m512d __B) { +__funline __m512d _mm512_min_pd(__m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_minpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_min_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_mask_min_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B) { return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_min_pd(__mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_maskz_min_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_minpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, 
_MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_min_ps(__m512 __A, __m512 __B) { +__funline __m512 _mm512_min_ps(__m512 __A, __m512 __B) { return (__m512)__builtin_ia32_minps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_min_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_mask_min_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B) { return (__m512)__builtin_ia32_minps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_min_ps(__mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_maskz_min_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_minps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_min_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_min_sd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_minsd_mask_round((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_min_sd(__mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_min_sd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_minsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_min_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_min_ss(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_minss_mask_round((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_min_ss(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_min_ss(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_minss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_scalef_pd(__m512d __A, __m512d __B) { +__funline __m512d _mm512_scalef_pd(__m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_scalefpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)_mm512_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_scalef_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { +__funline __m512d _mm512_mask_scalef_pd(__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B) { return (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_scalef_pd(__mmask8 __U, 
__m512d __A, __m512d __B) { +__funline __m512d _mm512_maskz_scalef_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_scalefpd512_mask( (__v8df)__A, (__v8df)__B, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_scalef_ps(__m512 __A, __m512 __B) { +__funline __m512 _mm512_scalef_ps(__m512 __A, __m512 __B) { return (__m512)__builtin_ia32_scalefps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_scalef_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_mask_scalef_ps(__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B) { return (__m512)__builtin_ia32_scalefps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_scalef_ps(__mmask16 __U, __m512 __A, __m512 __B) { +__funline __m512 _mm512_maskz_scalef_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_scalefps512_mask( (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_scalef_sd(__m128d __A, __m128d __B) { +__funline __m128d _mm_scalef_sd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_scalefsd_mask_round( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_scalef_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_scalef_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_scalefss_mask_round( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) { +__funline __m512d _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) { return (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { +__funline __m512d _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C) { return (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { +__funline __m512d _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U) { return (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { +__funline __m512d _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C) { return 
(__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) { +__funline __m512 _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) { return (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { +__funline __m512 _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C) { return (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { +__funline __m512 _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U) { return (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { +__funline __m512 _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C) { return (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) { +__funline __m512d _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) { return (__m512d)__builtin_ia32_vfmsubpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { +__funline __m512d _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C) { return (__m512d)__builtin_ia32_vfmsubpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { +__funline __m512d _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U) { return (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { +__funline __m512d _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C) { return (__m512d)__builtin_ia32_vfmsubpd512_maskz((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -extern __inline __m512 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) { +__funline __m512 _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) { return (__m512)__builtin_ia32_vfmsubps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, 
                                                 (__mmask16)-1,
                                                 _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
+__funline __m512 _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B,
+                                      __m512 __C) {
   return (__m512)__builtin_ia32_vfmsubps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
+__funline __m512 _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C,
+                                       __mmask16 __U) {
   return (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
+__funline __m512 _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B,
+                                       __m512 __C) {
   return (__m512)__builtin_ia32_vfmsubps512_maskz((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) {
+__funline __m512d _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) {
   return (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__,
-                                       __artificial__))
-_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
+__funline __m512d _mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B,
+                                          __m512d __C) {
   return (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__,
-                                       __artificial__))
-_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
+__funline __m512d _mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C,
+                                           __mmask8 __U) {
   return (__m512d)__builtin_ia32_vfmaddsubpd512_mask3(
       (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U,
       _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__,
-                                       __artificial__))
-_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
+__funline __m512d _mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B,
+                                           __m512d __C) {
   return (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(
       (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) {
+__funline __m512 _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) {
   return (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
+__funline __m512 _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B,
+                                         __m512 __C) {
   return (__m512)__builtin_ia32_vfmaddsubps512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__,
-                                      __artificial__))
-_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
+__funline __m512 _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C,
+                                          __mmask16 __U) {
   return (__m512)__builtin_ia32_vfmaddsubps512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__,
-                                      __artificial__))
-_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
+__funline __m512 _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B,
+                                          __m512 __C) {
   return (__m512)__builtin_ia32_vfmaddsubps512_maskz(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) {
+__funline __m512d _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) {
   return (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__,
-                                       __artificial__))
-_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
+__funline __m512d _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B,
+                                          __m512d __C) {
   return (__m512d)__builtin_ia32_vfmaddsubpd512_mask(
      (__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__,
-                                       __artificial__))
-_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
+__funline __m512d _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C,
+                                           __mmask8 __U) {
   return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3(
      (__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__,
-                                       __artificial__))
-_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
+__funline __m512d _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B,
+                                           __m512d __C) {
   return (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(
      (__v8df)__A, (__v8df)__B, -(__v8df)__C, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) {
+__funline __m512 _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) {
   return (__m512)__builtin_ia32_vfmaddsubps512_mask(
      (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
+__funline __m512 _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B,
+                                         __m512 __C) {
   return (__m512)__builtin_ia32_vfmaddsubps512_mask(
      (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__,
-                                      __artificial__))
-_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
+__funline __m512 _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C,
+                                          __mmask16 __U) {
   return (__m512)__builtin_ia32_vfmsubaddps512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__,
-                                      __artificial__))
-_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
+__funline __m512 _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B,
+                                          __m512 __C) {
   return (__m512)__builtin_ia32_vfmaddsubps512_maskz(
      (__v16sf)__A, (__v16sf)__B, -(__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) {
+__funline __m512d _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) {
   return (__m512d)__builtin_ia32_vfnmaddpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
+__funline __m512d _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B,
+                                        __m512d __C) {
   return (__m512d)__builtin_ia32_vfnmaddpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__,
-                                       __artificial__))
-_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
+__funline __m512d _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C,
+                                         __mmask8 __U) {
   return (__m512d)__builtin_ia32_vfnmaddpd512_mask3((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__,
-                                       __artificial__))
-_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
+__funline __m512d _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B,
+                                         __m512d __C) {
   return (__m512d)__builtin_ia32_vfnmaddpd512_maskz((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) {
+__funline __m512 _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) {
   return (__m512)__builtin_ia32_vfnmaddps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
+__funline __m512 _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B,
+                                       __m512 __C) {
   return (__m512)__builtin_ia32_vfnmaddps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
+__funline __m512 _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C,
+                                        __mmask16 __U) {
   return (__m512)__builtin_ia32_vfnmaddps512_mask3((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
+__funline __m512 _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B,
+                                        __m512 __C) {
   return (__m512)__builtin_ia32_vfnmaddps512_maskz((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) {
+__funline __m512d _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) {
   return (__m512d)__builtin_ia32_vfnmsubpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
+__funline __m512d _mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B,
+                                        __m512d __C) {
   return (__m512d)__builtin_ia32_vfnmsubpd512_mask((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__,
-                                       __artificial__))
-_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
+__funline __m512d _mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C,
+                                         __mmask8 __U) {
   return (__m512d)__builtin_ia32_vfnmsubpd512_mask3((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__,
-                                       __artificial__))
-_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
+__funline __m512d _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B,
+                                         __m512d __C) {
   return (__m512d)__builtin_ia32_vfnmsubpd512_maskz((__v8df)__A, (__v8df)__B, (__v8df)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) {
+__funline __m512 _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) {
   return (__m512)__builtin_ia32_vfnmsubps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
+__funline __m512 _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B,
+                                       __m512 __C) {
   return (__m512)__builtin_ia32_vfnmsubps512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
+__funline __m512 _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C,
+                                        __mmask16 __U) {
   return (__m512)__builtin_ia32_vfnmsubps512_mask3((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
+__funline __m512 _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B,
+                                        __m512 __C) {
   return (__m512)__builtin_ia32_vfnmsubps512_maskz((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cvttpd_epi32(__m512d __A) {
+__funline __m256i _mm512_cvttpd_epi32(__m512d __A) {
   return (__m256i)__builtin_ia32_cvttpd2dq512_mask(
      (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cvttpd_epi32(__m256i __W, __mmask8 __U, __m512d __A) {
+__funline __m256i _mm512_mask_cvttpd_epi32(__m256i __W, __mmask8 __U,
+                                           __m512d __A) {
   return (__m256i)__builtin_ia32_cvttpd2dq512_mask(
      (__v8df)__A, (__v8si)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_cvttpd_epi32(__mmask8 __U, __m512d __A) {
+__funline __m256i _mm512_maskz_cvttpd_epi32(__mmask8 __U, __m512d __A) {
   return (__m256i)__builtin_ia32_cvttpd2dq512_mask(
      (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cvttpd_epu32(__m512d __A) {
+__funline __m256i _mm512_cvttpd_epu32(__m512d __A) {
   return (__m256i)__builtin_ia32_cvttpd2udq512_mask(
      (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cvttpd_epu32(__m256i __W, __mmask8 __U, __m512d __A) {
+__funline __m256i _mm512_mask_cvttpd_epu32(__m256i __W, __mmask8 __U,
+                                           __m512d __A) {
   return (__m256i)__builtin_ia32_cvttpd2udq512_mask(
      (__v8df)__A, (__v8si)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_cvttpd_epu32(__mmask8 __U, __m512d __A) {
+__funline __m256i _mm512_maskz_cvttpd_epu32(__mmask8 __U, __m512d __A) {
   return (__m256i)__builtin_ia32_cvttpd2udq512_mask(
      (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cvtpd_epi32(__m512d __A) {
+__funline __m256i _mm512_cvtpd_epi32(__m512d __A) {
   return (__m256i)__builtin_ia32_cvtpd2dq512_mask(
      (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cvtpd_epi32(__m256i __W, __mmask8 __U, __m512d __A) {
+__funline __m256i _mm512_mask_cvtpd_epi32(__m256i __W, __mmask8 __U,
+                                          __m512d __A) {
   return (__m256i)__builtin_ia32_cvtpd2dq512_mask(
      (__v8df)__A, (__v8si)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_cvtpd_epi32(__mmask8 __U, __m512d __A) {
+__funline __m256i _mm512_maskz_cvtpd_epi32(__mmask8 __U, __m512d __A) {
   return (__m256i)__builtin_ia32_cvtpd2dq512_mask(
      (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cvtpd_epu32(__m512d __A) {
+__funline __m256i _mm512_cvtpd_epu32(__m512d __A) {
   return (__m256i)__builtin_ia32_cvtpd2udq512_mask(
      (__v8df)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cvtpd_epu32(__m256i __W, __mmask8 __U, __m512d __A) {
+__funline __m256i _mm512_mask_cvtpd_epu32(__m256i __W, __mmask8 __U,
+                                          __m512d __A) {
   return (__m256i)__builtin_ia32_cvtpd2udq512_mask(
      (__v8df)__A, (__v8si)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_cvtpd_epu32(__mmask8 __U, __m512d __A) {
+__funline __m256i _mm512_maskz_cvtpd_epu32(__mmask8 __U, __m512d __A) {
   return (__m256i)__builtin_ia32_cvtpd2udq512_mask(
      (__v8df)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cvttps_epi32(__m512 __A) {
+__funline __m512i _mm512_cvttps_epi32(__m512 __A) {
   return (__m512i)__builtin_ia32_cvttps2dq512_mask(
      (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cvttps_epi32(__m512i __W, __mmask16 __U, __m512 __A) {
+__funline __m512i _mm512_mask_cvttps_epi32(__m512i __W, __mmask16 __U,
+                                           __m512 __A) {
   return (__m512i)__builtin_ia32_cvttps2dq512_mask(
      (__v16sf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_cvttps_epi32(__mmask16 __U, __m512 __A) {
+__funline __m512i _mm512_maskz_cvttps_epi32(__mmask16 __U, __m512 __A) {
   return (__m512i)__builtin_ia32_cvttps2dq512_mask(
      (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cvttps_epu32(__m512 __A) {
+__funline __m512i _mm512_cvttps_epu32(__m512 __A) {
  return (__m512i)__builtin_ia32_cvttps2udq512_mask(
      (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cvttps_epu32(__m512i __W, __mmask16 __U, __m512 __A) {
+__funline __m512i _mm512_mask_cvttps_epu32(__m512i __W, __mmask16 __U,
+                                           __m512 __A) {
   return (__m512i)__builtin_ia32_cvttps2udq512_mask(
      (__v16sf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_cvttps_epu32(__mmask16 __U, __m512 __A) {
+__funline __m512i _mm512_maskz_cvttps_epu32(__mmask16 __U, __m512 __A) {
   return (__m512i)__builtin_ia32_cvttps2udq512_mask(
      (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cvtps_epi32(__m512 __A) {
+__funline __m512i _mm512_cvtps_epi32(__m512 __A) {
   return (__m512i)__builtin_ia32_cvtps2dq512_mask(
      (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cvtps_epi32(__m512i __W, __mmask16 __U, __m512 __A) {
+__funline __m512i _mm512_mask_cvtps_epi32(__m512i __W, __mmask16 __U,
+                                          __m512 __A) {
   return (__m512i)__builtin_ia32_cvtps2dq512_mask(
      (__v16sf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_cvtps_epi32(__mmask16 __U, __m512 __A) {
+__funline __m512i _mm512_maskz_cvtps_epi32(__mmask16 __U, __m512 __A) {
   return (__m512i)__builtin_ia32_cvtps2dq512_mask(
      (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cvtps_epu32(__m512 __A) {
+__funline __m512i _mm512_cvtps_epu32(__m512 __A) {
   return (__m512i)__builtin_ia32_cvtps2udq512_mask(
      (__v16sf)__A, (__v16si)_mm512_undefined_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cvtps_epu32(__m512i __W, __mmask16 __U, __m512 __A) {
+__funline __m512i _mm512_mask_cvtps_epu32(__m512i __W, __mmask16 __U,
+                                          __m512 __A) {
   return (__m512i)__builtin_ia32_cvtps2udq512_mask(
      (__v16sf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_cvtps_epu32(__mmask16 __U, __m512 __A) {
+__funline __m512i _mm512_maskz_cvtps_epu32(__mmask16 __U, __m512 __A) {
   return (__m512i)__builtin_ia32_cvtps2udq512_mask(
      (__v16sf)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline double
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cvtsd_f64(__m512d __A) {
+__funline double _mm512_cvtsd_f64(__m512d __A) {
   return __A[0];
 }
-extern __inline float
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cvtss_f32(__m512 __A) {
+__funline float _mm512_cvtss_f32(__m512 __A) {
   return __A[0];
 }
 #ifdef __x86_64__
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtu64_ss(__m128 __A, unsigned long long __B) {
+__funline __m128 _mm_cvtu64_ss(__m128 __A, unsigned long long __B) {
   return (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)__A, __B, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtu64_sd(__m128d __A, unsigned long long __B) {
+__funline __m128d _mm_cvtu64_sd(__m128d __A, unsigned long long __B) {
   return (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)__A, __B, _MM_FROUND_CUR_DIRECTION);
 }
 #endif
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtu32_ss(__m128 __A, unsigned __B) {
+__funline __m128 _mm_cvtu32_ss(__m128 __A, unsigned __B) {
   return (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)__A, __B, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cvtepi32_ps(__m512i __A) {
+__funline __m512 _mm512_cvtepi32_ps(__m512i __A) {
   return (__m512)__builtin_ia32_cvtdq2ps512_mask(
      (__v16si)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cvtepi32_ps(__m512 __W, __mmask16 __U, __m512i __A) {
+__funline __m512 _mm512_mask_cvtepi32_ps(__m512 __W, __mmask16 __U, __m512i __A) {
   return (__m512)__builtin_ia32_cvtdq2ps512_mask(
      (__v16si)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_cvtepi32_ps(__mmask16 __U, __m512i __A) {
+__funline __m512 _mm512_maskz_cvtepi32_ps(__mmask16 __U, __m512i __A) {
   return (__m512)__builtin_ia32_cvtdq2ps512_mask(
      (__v16si)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__,
-                   __artificial__))
-    _mm512_cvtepu32_ps(__m512i __A) {
+__funline __m512 _mm512_cvtepu32_ps(__m512i __A) {
   return (__m512)__builtin_ia32_cvtudq2ps512_mask(
      (__v16si)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cvtepu32_ps(__m512 __W, __mmask16 __U, __m512i __A) {
+__funline __m512 _mm512_mask_cvtepu32_ps(__m512 __W, __mmask16 __U, __m512i __A) {
   return (__m512)__builtin_ia32_cvtudq2ps512_mask(
      (__v16si)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_cvtepu32_ps(__mmask16 __U, __m512i __A) {
+__funline __m512 _mm512_maskz_cvtepu32_ps(__mmask16 __U, __m512i __A) {
   return (__m512)__builtin_ia32_cvtudq2ps512_mask(
      (__v16si)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
 #ifdef __OPTIMIZE__
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_fixupimm_pd(__m512d __A, __m512d __B, __m512i __C, const int __imm) {
+__funline __m512d _mm512_fixupimm_pd(__m512d __A, __m512d __B, __m512i __C,
+                                     const int __imm) {
   return (__m512d)__builtin_ia32_fixupimmpd512_mask(
      (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_fixupimm_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512i __C,
-                            const int __imm) {
+__funline __m512d _mm512_mask_fixupimm_pd(__m512d __A, __mmask8 __U, __m512d __B,
+                                          __m512i __C, const int __imm) {
   return (__m512d)__builtin_ia32_fixupimmpd512_mask(
      (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_fixupimm_pd(__mmask8 __U, __m512d __A, __m512d __B,
-                             __m512i __C, const int __imm) {
+__funline __m512d _mm512_maskz_fixupimm_pd(__mmask8 __U, __m512d __A, __m512d __B,
+                                           __m512i __C, const int __imm) {
   return (__m512d)__builtin_ia32_fixupimmpd512_maskz(
      (__v8df)__A, (__v8df)__B, (__v8di)__C, __imm, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_fixupimm_ps(__m512 __A, __m512 __B, __m512i __C, const int __imm) {
+__funline __m512 _mm512_fixupimm_ps(__m512 __A, __m512 __B, __m512i __C,
+                                    const int __imm) {
   return (__m512)__builtin_ia32_fixupimmps512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_fixupimm_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512i __C,
-                            const int __imm) {
+__funline __m512 _mm512_mask_fixupimm_ps(__m512 __A, __mmask16 __U, __m512 __B,
+                                         __m512i __C, const int __imm) {
   return (__m512)__builtin_ia32_fixupimmps512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_fixupimm_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512i __C,
-                             const int __imm) {
+__funline __m512 _mm512_maskz_fixupimm_ps(__mmask16 __U, __m512 __A, __m512 __B,
+                                          __m512i __C, const int __imm) {
   return (__m512)__builtin_ia32_fixupimmps512_maskz(
      (__v16sf)__A, (__v16sf)__B, (__v16si)__C, __imm, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_fixupimm_sd(__m128d __A, __m128d __B, __m128i __C, const int __imm) {
+__funline __m128d _mm_fixupimm_sd(__m128d __A, __m128d __B, __m128i __C,
+                                  const int __imm) {
   return (__m128d)__builtin_ia32_fixupimmsd_mask(
      (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mask_fixupimm_sd(__m128d __A, __mmask8 __U, __m128d __B, __m128i __C,
-                         const int __imm) {
+__funline __m128d _mm_mask_fixupimm_sd(__m128d __A, __mmask8 __U, __m128d __B,
+                                       __m128i __C, const int __imm) {
   return (__m128d)__builtin_ia32_fixupimmsd_mask(
      (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_maskz_fixupimm_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128i __C,
-                          const int __imm) {
+__funline __m128d _mm_maskz_fixupimm_sd(__mmask8 __U, __m128d __A, __m128d __B,
+                                        __m128i __C, const int __imm) {
   return (__m128d)__builtin_ia32_fixupimmsd_maskz(
      (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_fixupimm_ss(__m128 __A, __m128 __B, __m128i __C, const int __imm) {
+__funline __m128 _mm_fixupimm_ss(__m128 __A, __m128 __B, __m128i __C,
+                                 const int __imm) {
   return (__m128)__builtin_ia32_fixupimmss_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mask_fixupimm_ss(__m128 __A, __mmask8 __U, __m128 __B, __m128i __C,
-                         const int __imm) {
+__funline __m128 _mm_mask_fixupimm_ss(__m128 __A, __mmask8 __U, __m128 __B,
+                                      __m128i __C, const int __imm) {
   return (__m128)__builtin_ia32_fixupimmss_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_maskz_fixupimm_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128i __C,
-                          const int __imm) {
+__funline __m128 _mm_maskz_fixupimm_ss(__mmask8 __U, __m128 __A, __m128 __B,
+                                       __m128i __C, const int __imm) {
   return (__m128)__builtin_ia32_fixupimmss_maskz(
      (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
@@ -10995,364 +9106,278 @@ extern __inline __m128
 #endif
 #ifdef __x86_64__
-extern __inline unsigned long long
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtss_u64(__m128 __A) {
+__funline unsigned long long _mm_cvtss_u64(__m128 __A) {
   return (unsigned long long)__builtin_ia32_vcvtss2usi64(
      (__v4sf)__A, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline unsigned long long
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvttss_u64(__m128 __A) {
+__funline unsigned long long _mm_cvttss_u64(__m128 __A) {
   return (unsigned long long)__builtin_ia32_vcvttss2usi64(
      (__v4sf)__A, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline long long
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvttss_i64(__m128 __A) {
+__funline long long _mm_cvttss_i64(__m128 __A) {
   return (long long)__builtin_ia32_vcvttss2si64((__v4sf)__A, _MM_FROUND_CUR_DIRECTION);
 }
 #endif /* __x86_64__ */
-extern __inline unsigned
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtss_u32(__m128 __A) {
+__funline unsigned _mm_cvtss_u32(__m128 __A) {
   return (unsigned)__builtin_ia32_vcvtss2usi32((__v4sf)__A, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline unsigned
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvttss_u32(__m128 __A) {
+__funline unsigned _mm_cvttss_u32(__m128 __A) {
   return (unsigned)__builtin_ia32_vcvttss2usi32((__v4sf)__A, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvttss_i32(__m128 __A) {
+__funline int _mm_cvttss_i32(__m128 __A) {
   return (int)__builtin_ia32_vcvttss2si32((__v4sf)__A, _MM_FROUND_CUR_DIRECTION);
 }
 #ifdef __x86_64__
-extern __inline unsigned long long
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtsd_u64(__m128d __A) {
+__funline unsigned long long _mm_cvtsd_u64(__m128d __A) {
   return (unsigned long long)__builtin_ia32_vcvtsd2usi64(
      (__v2df)__A, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline unsigned long long
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvttsd_u64(__m128d __A) {
+__funline unsigned long long _mm_cvttsd_u64(__m128d __A) {
   return (unsigned long long)__builtin_ia32_vcvttsd2usi64(
      (__v2df)__A, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline long long
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvttsd_i64(__m128d __A) {
+__funline long long _mm_cvttsd_i64(__m128d __A) {
   return (long long)__builtin_ia32_vcvttsd2si64((__v2df)__A, _MM_FROUND_CUR_DIRECTION);
 }
 #endif /* __x86_64__ */
-extern __inline unsigned
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtsd_u32(__m128d __A) {
+__funline unsigned _mm_cvtsd_u32(__m128d __A) {
   return (unsigned)__builtin_ia32_vcvtsd2usi32((__v2df)__A, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline unsigned
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvttsd_u32(__m128d __A) {
+__funline unsigned _mm_cvttsd_u32(__m128d __A) {
   return (unsigned)__builtin_ia32_vcvttsd2usi32((__v2df)__A, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvttsd_i32(__m128d __A) {
+__funline int _mm_cvttsd_i32(__m128d __A) {
   return (int)__builtin_ia32_vcvttsd2si32((__v2df)__A, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cvtps_pd(__m256 __A) {
+__funline __m512d _mm512_cvtps_pd(__m256 __A) {
   return (__m512d)__builtin_ia32_cvtps2pd512_mask(
      (__v8sf)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cvtps_pd(__m512d __W, __mmask8 __U, __m256 __A) {
+__funline __m512d _mm512_mask_cvtps_pd(__m512d __W, __mmask8 __U, __m256 __A) {
   return (__m512d)__builtin_ia32_cvtps2pd512_mask(
      (__v8sf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_cvtps_pd(__mmask8 __U, __m256 __A) {
+__funline __m512d _mm512_maskz_cvtps_pd(__mmask8 __U, __m256 __A) {
   return (__m512d)__builtin_ia32_cvtps2pd512_mask(
      (__v8sf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cvtph_ps(__m256i __A) {
+__funline __m512 _mm512_cvtph_ps(__m256i __A) {
   return (__m512)__builtin_ia32_vcvtph2ps512_mask(
      (__v16hi)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cvtph_ps(__m512 __W, __mmask16 __U, __m256i __A) {
+__funline __m512 _mm512_mask_cvtph_ps(__m512 __W, __mmask16 __U, __m256i __A) {
   return (__m512)__builtin_ia32_vcvtph2ps512_mask(
      (__v16hi)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_cvtph_ps(__mmask16 __U, __m256i __A) {
+__funline __m512 _mm512_maskz_cvtph_ps(__mmask16 __U, __m256i __A) {
   return (__m512)__builtin_ia32_vcvtph2ps512_mask(
      (__v16hi)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m256
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cvtpd_ps(__m512d __A) {
+__funline __m256 _mm512_cvtpd_ps(__m512d __A) {
   return (__m256)__builtin_ia32_cvtpd2ps512_mask(
      (__v8df)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m256
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cvtpd_ps(__m256 __W, __mmask8 __U, __m512d __A) {
+__funline __m256 _mm512_mask_cvtpd_ps(__m256 __W, __mmask8 __U, __m512d __A) {
   return (__m256)__builtin_ia32_cvtpd2ps512_mask(
      (__v8df)__A, (__v8sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m256
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_cvtpd_ps(__mmask8 __U, __m512d __A) {
+__funline __m256 _mm512_maskz_cvtpd_ps(__mmask8 __U, __m512d __A) {
   return (__m256)__builtin_ia32_cvtpd2ps512_mask(
      (__v8df)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
 #ifdef __OPTIMIZE__
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_getexp_ps(__m512 __A) {
+__funline __m512 _mm512_getexp_ps(__m512 __A) {
   return (__m512)__builtin_ia32_getexpps512_mask(
      (__v16sf)__A, (__v16sf)_mm512_undefined_ps(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_getexp_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+__funline __m512 _mm512_mask_getexp_ps(__m512 __W, __mmask16 __U, __m512 __A) {
   return (__m512)__builtin_ia32_getexpps512_mask(
      (__v16sf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_getexp_ps(__mmask16 __U, __m512 __A) {
+__funline __m512 _mm512_maskz_getexp_ps(__mmask16 __U, __m512 __A) {
   return (__m512)__builtin_ia32_getexpps512_mask(
      (__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_getexp_pd(__m512d __A) {
+__funline __m512d _mm512_getexp_pd(__m512d __A) {
   return (__m512d)__builtin_ia32_getexppd512_mask(
      (__v8df)__A, (__v8df)_mm512_undefined_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_getexp_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+__funline __m512d _mm512_mask_getexp_pd(__m512d __W, __mmask8 __U, __m512d __A) {
   return (__m512d)__builtin_ia32_getexppd512_mask(
      (__v8df)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_getexp_pd(__mmask8 __U, __m512d __A) {
+__funline __m512d _mm512_maskz_getexp_pd(__mmask8 __U, __m512d __A) {
   return (__m512d)__builtin_ia32_getexppd512_mask(
      (__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_getexp_ss(__m128 __A, __m128 __B) {
+__funline __m128 _mm_getexp_ss(__m128 __A, __m128 __B) {
   return (__m128)__builtin_ia32_getexpss128_round((__v4sf)__A, (__v4sf)__B, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mask_getexp_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+__funline __m128 _mm_mask_getexp_ss(__m128 __W, __mmask8 __U, __m128 __A,
+                                    __m128 __B) {
   return (__m128)__builtin_ia32_getexpss_mask_round((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_maskz_getexp_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+__funline __m128 _mm_maskz_getexp_ss(__mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128)__builtin_ia32_getexpss_mask_round(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_getexp_sd(__m128d __A, __m128d __B) {
+__funline __m128d _mm_getexp_sd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_getexpsd128_round((__v2df)__A, (__v2df)__B, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mask_getexp_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_mask_getexp_sd(__m128d __W, __mmask8 __U, __m128d __A,
+                                     __m128d __B) {
   return (__m128d)__builtin_ia32_getexpsd_mask_round((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_maskz_getexp_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_maskz_getexp_sd(__mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_getexpsd_mask_round(
      (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_getmant_pd(__m512d __A, _MM_MANTISSA_NORM_ENUM __B,
-                      _MM_MANTISSA_SIGN_ENUM __C) {
+__funline __m512d _mm512_getmant_pd(__m512d __A, _MM_MANTISSA_NORM_ENUM __B,
+                                    _MM_MANTISSA_SIGN_ENUM __C) {
   return (__m512d)__builtin_ia32_getmantpd512_mask(
      (__v8df)__A, (__C << 2) | __B, _mm512_undefined_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_getmant_pd(__m512d __W, __mmask8 __U, __m512d __A,
-                           _MM_MANTISSA_NORM_ENUM __B,
-                           _MM_MANTISSA_SIGN_ENUM __C) {
+__funline __m512d _mm512_mask_getmant_pd(__m512d __W, __mmask8 __U, __m512d __A,
+                                         _MM_MANTISSA_NORM_ENUM __B,
+                                         _MM_MANTISSA_SIGN_ENUM __C) {
   return (__m512d)__builtin_ia32_getmantpd512_mask(
      (__v8df)__A, (__C << 2) | __B, (__v8df)__W, __U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_getmant_pd(__mmask8 __U, __m512d __A,
-                            _MM_MANTISSA_NORM_ENUM __B,
-                            _MM_MANTISSA_SIGN_ENUM __C) {
+__funline __m512d _mm512_maskz_getmant_pd(__mmask8 __U, __m512d __A,
+                                          _MM_MANTISSA_NORM_ENUM __B,
+                                          _MM_MANTISSA_SIGN_ENUM __C) {
   return (__m512d)__builtin_ia32_getmantpd512_mask(
      (__v8df)__A, (__C << 2) | __B, (__v8df)_mm512_setzero_pd(), __U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_getmant_ps(__m512 __A, _MM_MANTISSA_NORM_ENUM __B,
-                      _MM_MANTISSA_SIGN_ENUM __C) {
+__funline __m512 _mm512_getmant_ps(__m512 __A, _MM_MANTISSA_NORM_ENUM __B,
+                                   _MM_MANTISSA_SIGN_ENUM __C) {
   return (__m512)__builtin_ia32_getmantps512_mask(
      (__v16sf)__A, (__C << 2) | __B, _mm512_undefined_ps(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_getmant_ps(__m512 __W, __mmask16 __U, __m512 __A,
-                           _MM_MANTISSA_NORM_ENUM __B,
-                           _MM_MANTISSA_SIGN_ENUM __C) {
+__funline __m512 _mm512_mask_getmant_ps(__m512 __W, __mmask16 __U, __m512 __A,
+                                        _MM_MANTISSA_NORM_ENUM __B,
+                                        _MM_MANTISSA_SIGN_ENUM __C) {
   return (__m512)__builtin_ia32_getmantps512_mask(
      (__v16sf)__A, (__C << 2) | __B, (__v16sf)__W, __U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_getmant_ps(__mmask16 __U, __m512 __A,
-                            _MM_MANTISSA_NORM_ENUM __B,
-                            _MM_MANTISSA_SIGN_ENUM __C) {
+__funline __m512 _mm512_maskz_getmant_ps(__mmask16 __U, __m512 __A,
+                                         _MM_MANTISSA_NORM_ENUM __B,
+                                         _MM_MANTISSA_SIGN_ENUM __C) {
   return (__m512)__builtin_ia32_getmantps512_mask(
      (__v16sf)__A, (__C << 2) | __B, (__v16sf)_mm512_setzero_ps(), __U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_getmant_sd(__m128d __A, __m128d __B, _MM_MANTISSA_NORM_ENUM __C,
-                   _MM_MANTISSA_SIGN_ENUM __D) {
+__funline __m128d _mm_getmant_sd(__m128d __A, __m128d __B,
+                                 _MM_MANTISSA_NORM_ENUM __C,
+                                 _MM_MANTISSA_SIGN_ENUM __D) {
   return (__m128d)__builtin_ia32_getmantsd_round(
      (__v2df)__A, (__v2df)__B, (__D << 2) | __C, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mask_getmant_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
-                        _MM_MANTISSA_NORM_ENUM __C,
-                        _MM_MANTISSA_SIGN_ENUM __D) {
+__funline __m128d _mm_mask_getmant_sd(__m128d __W, __mmask8 __U, __m128d __A,
+                                      __m128d __B, _MM_MANTISSA_NORM_ENUM __C,
+                                      _MM_MANTISSA_SIGN_ENUM __D) {
   return (__m128d)__builtin_ia32_getmantsd_mask_round(
      (__v2df)__A, (__v2df)__B, (__D << 2) | __C, (__v2df)__W, __U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_maskz_getmant_sd(__mmask8 __U, __m128d __A, __m128d __B,
-                         _MM_MANTISSA_NORM_ENUM __C,
-                         _MM_MANTISSA_SIGN_ENUM __D) {
+__funline __m128d _mm_maskz_getmant_sd(__mmask8 __U, __m128d __A, __m128d __B,
+                                       _MM_MANTISSA_NORM_ENUM __C,
+                                       _MM_MANTISSA_SIGN_ENUM __D) {
   return (__m128d)__builtin_ia32_getmantsd_mask_round(
      (__v2df)__A, (__v2df)__B, (__D << 2) | __C, (__v2df)_mm_setzero_pd(),
      __U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_getmant_ss(__m128 __A, __m128 __B, _MM_MANTISSA_NORM_ENUM __C,
-                   _MM_MANTISSA_SIGN_ENUM __D) {
+__funline __m128 _mm_getmant_ss(__m128 __A, __m128 __B,
+                                _MM_MANTISSA_NORM_ENUM __C,
+                                _MM_MANTISSA_SIGN_ENUM __D) {
   return (__m128)__builtin_ia32_getmantss_round(
      (__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mask_getmant_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
-                        _MM_MANTISSA_NORM_ENUM __C,
-                        _MM_MANTISSA_SIGN_ENUM __D) {
+__funline __m128 _mm_mask_getmant_ss(__m128 __W, __mmask8 __U, __m128 __A,
+                                     __m128 __B, _MM_MANTISSA_NORM_ENUM __C,
+                                     _MM_MANTISSA_SIGN_ENUM __D) {
   return (__m128)__builtin_ia32_getmantss_mask_round(
      (__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, (__v4sf)__W, __U,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_maskz_getmant_ss(__mmask8 __U, __m128 __A, __m128 __B,
-                         _MM_MANTISSA_NORM_ENUM __C,
-                         _MM_MANTISSA_SIGN_ENUM __D) {
+__funline __m128 _mm_maskz_getmant_ss(__mmask8 __U, __m128 __A, __m128 __B,
+                                      _MM_MANTISSA_NORM_ENUM __C,
+                                      _MM_MANTISSA_SIGN_ENUM __D) {
   return (__m128)__builtin_ia32_getmantss_mask_round(
      (__v4sf)__A, (__v4sf)__B, (__D << 2) | __C, (__v4sf)_mm_setzero_ps(),
      __U, _MM_FROUND_CUR_DIRECTION);
@@ -11474,65 +9499,51 @@ extern __inline __m128
 #endif
 #ifdef __OPTIMIZE__
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_roundscale_ps(__m512 __A, const int __imm) {
+__funline __m512 _mm512_roundscale_ps(__m512 __A, const int __imm) {
   return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, __imm, (__v16sf)_mm512_undefined_ps(), -1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_roundscale_ps(__m512 __A, __mmask16 __B, __m512 __C,
-                              const int __imm) {
+__funline __m512 _mm512_mask_roundscale_ps(__m512 __A, __mmask16 __B, __m512 __C,
+                                           const int __imm) {
   return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__C, __imm, (__v16sf)__A, (__mmask16)__B, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_roundscale_ps(__mmask16 __A, __m512 __B, const int __imm) {
+__funline __m512 _mm512_maskz_roundscale_ps(__mmask16 __A, __m512 __B,
+                                            const int __imm) {
   return (__m512)__builtin_ia32_rndscaleps_mask(
      (__v16sf)__B, __imm, (__v16sf)_mm512_setzero_ps(), (__mmask16)__A,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_roundscale_pd(__m512d __A, const int __imm) {
+__funline __m512d _mm512_roundscale_pd(__m512d __A, const int __imm) {
   return (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)__A, __imm, (__v8df)_mm512_undefined_pd(), -1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_roundscale_pd(__m512d __A, __mmask8 __B, __m512d __C,
-                              const int __imm) {
+__funline __m512d _mm512_mask_roundscale_pd(__m512d __A, __mmask8 __B,
+                                            __m512d __C, const int __imm) {
   return (__m512d)__builtin_ia32_rndscalepd_mask(
      (__v8df)__C, __imm, (__v8df)__A, (__mmask8)__B, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_maskz_roundscale_pd(__mmask8 __A, __m512d __B, const int __imm) {
+__funline __m512d _mm512_maskz_roundscale_pd(__mmask8 __A, __m512d __B,
+                                             const int __imm) {
   return (__m512d)__builtin_ia32_rndscalepd_mask(
      (__v8df)__B, __imm, (__v8df)_mm512_setzero_pd(), (__mmask8)__A,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_roundscale_ss(__m128 __A, __m128 __B, const int __imm) {
+__funline __m128 _mm_roundscale_ss(__m128 __A, __m128 __B, const int __imm) {
   return (__m128)__builtin_ia32_rndscaless_round(
      (__v4sf)__A, (__v4sf)__B, __imm, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_roundscale_sd(__m128d __A, __m128d __B, const int __imm) {
+__funline __m128d _mm_roundscale_sd(__m128d __A, __m128d __B, const int __imm) {
   return (__m128d)__builtin_ia32_rndscalesd_round(
      (__v2df)__A, (__v2df)__B, __imm, _MM_FROUND_CUR_DIRECTION);
 }
@@ -11573,315 +9584,255 @@ extern __inline __m128d
 #endif
 #ifdef __OPTIMIZE__
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmp_pd_mask(__m512d __X, __m512d __Y, const int __P) {
+__funline __mmask8 _mm512_cmp_pd_mask(__m512d __X, __m512d __Y, const int __P) {
   return (__mmask8)__builtin_ia32_cmppd512_mask(
      (__v8df)__X, (__v8df)__Y, __P, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmp_ps_mask(__m512 __X, __m512 __Y, const int __P) {
+__funline __mmask16 _mm512_cmp_ps_mask(__m512 __X, __m512 __Y, const int __P) {
   return (__mmask16)__builtin_ia32_cmpps512_mask(
      (__v16sf)__X, (__v16sf)__Y, __P, (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16 __attribute__((__gnu_inline__, __always_inline__,
-                                         __artificial__))
-_mm512_mask_cmp_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y, const int __P) {
+__funline __mmask16 _mm512_mask_cmp_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y,
+                                            const int __P) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, __P, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__,
-                                        __artificial__))
-_mm512_mask_cmp_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y, const int __P) {
+__funline __mmask8 _mm512_mask_cmp_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y,
+                                           const int __P) {
   return (__mmask8)__builtin_ia32_cmppd512_mask(
      (__v8df)__X, (__v8df)__Y, __P, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpeq_pd_mask(__m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_cmpeq_pd_mask(__m512d __X, __m512d __Y) {
   return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_EQ_OQ, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpeq_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_mask_cmpeq_pd_mask(__mmask8 __U, __m512d __X,
+                                             __m512d __Y) {
   return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_EQ_OQ, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmplt_pd_mask(__m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_cmplt_pd_mask(__m512d __X, __m512d __Y) {
   return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_LT_OS, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmplt_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_mask_cmplt_pd_mask(__mmask8 __U, __m512d __X,
+                                             __m512d __Y) {
  return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_LT_OS, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmple_pd_mask(__m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_cmple_pd_mask(__m512d __X, __m512d __Y) {
   return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_LE_OS, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmple_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_mask_cmple_pd_mask(__mmask8 __U, __m512d __X,
+                                             __m512d __Y) {
   return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_LE_OS, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpunord_pd_mask(__m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_cmpunord_pd_mask(__m512d __X, __m512d __Y) {
   return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_UNORD_Q, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpunord_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_mask_cmpunord_pd_mask(__mmask8 __U, __m512d __X,
+                                                __m512d __Y) {
   return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_UNORD_Q, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpneq_pd_mask(__m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_cmpneq_pd_mask(__m512d __X, __m512d __Y) {
   return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_NEQ_UQ, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpneq_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_mask_cmpneq_pd_mask(__mmask8 __U, __m512d __X,
+                                              __m512d __Y) {
   return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_NEQ_UQ, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpnlt_pd_mask(__m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_cmpnlt_pd_mask(__m512d __X, __m512d __Y) {
   return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_NLT_US, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpnlt_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_mask_cmpnlt_pd_mask(__mmask8 __U, __m512d __X,
+                                              __m512d __Y) {
   return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_NLT_US, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpnle_pd_mask(__m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_cmpnle_pd_mask(__m512d __X, __m512d __Y) {
   return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_NLE_US, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpnle_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_mask_cmpnle_pd_mask(__mmask8 __U, __m512d __X,
+                                              __m512d __Y) {
   return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_NLE_US, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpord_pd_mask(__m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_cmpord_pd_mask(__m512d __X, __m512d __Y) {
   return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_ORD_Q, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpord_pd_mask(__mmask8 __U, __m512d __X, __m512d __Y) {
+__funline __mmask8 _mm512_mask_cmpord_pd_mask(__mmask8 __U, __m512d __X,
+                                              __m512d __Y) {
   return (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)__X, (__v8df)__Y, _CMP_ORD_Q, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpeq_ps_mask(__m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_cmpeq_ps_mask(__m512 __X, __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_EQ_OQ, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpeq_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_mask_cmpeq_ps_mask(__mmask16 __U, __m512 __X,
+                                              __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_EQ_OQ, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmplt_ps_mask(__m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_cmplt_ps_mask(__m512 __X, __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_LT_OS, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmplt_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_mask_cmplt_ps_mask(__mmask16 __U, __m512 __X,
+                                              __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_LT_OS, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmple_ps_mask(__m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_cmple_ps_mask(__m512 __X, __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_LE_OS, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmple_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_mask_cmple_ps_mask(__mmask16 __U, __m512 __X,
+                                              __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_LE_OS, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpunord_ps_mask(__m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_cmpunord_ps_mask(__m512 __X, __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_UNORD_Q, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpunord_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_mask_cmpunord_ps_mask(__mmask16 __U, __m512 __X,
+                                                 __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_UNORD_Q, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpneq_ps_mask(__m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_cmpneq_ps_mask(__m512 __X, __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_NEQ_UQ, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpneq_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_mask_cmpneq_ps_mask(__mmask16 __U, __m512 __X,
+                                               __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_NEQ_UQ, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpnlt_ps_mask(__m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_cmpnlt_ps_mask(__m512 __X, __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_NLT_US, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpnlt_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_mask_cmpnlt_ps_mask(__mmask16 __U, __m512 __X,
+                                               __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_NLT_US, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpnle_ps_mask(__m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_cmpnle_ps_mask(__m512 __X, __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_NLE_US, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpnle_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_mask_cmpnle_ps_mask(__mmask16 __U, __m512 __X,
+                                               __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_NLE_US, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpord_ps_mask(__m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_cmpord_ps_mask(__m512 __X, __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_ORD_Q, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpord_ps_mask(__mmask16 __U, __m512 __X, __m512 __Y) {
+__funline __mmask16 _mm512_mask_cmpord_ps_mask(__mmask16 __U, __m512 __X,
+                                               __m512 __Y) {
   return (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)__X, (__v16sf)__Y, _CMP_ORD_Q, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmp_sd_mask(__m128d __X, __m128d __Y, const int __P) {
+__funline __mmask8 _mm_cmp_sd_mask(__m128d __X, __m128d __Y, const int __P) {
   return (__mmask8)__builtin_ia32_cmpsd_mask(
      (__v2df)__X, (__v2df)__Y, __P, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__,
-                                        __artificial__))
-_mm_mask_cmp_sd_mask(__mmask8 __M, __m128d __X, __m128d __Y, const int __P) {
+__funline __mmask8 _mm_mask_cmp_sd_mask(__mmask8 __M, __m128d __X, __m128d __Y,
+                                        const int __P) {
   return (__mmask8)__builtin_ia32_cmpsd_mask(
      (__v2df)__X, (__v2df)__Y, __P, (__mmask8)__M, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmp_ss_mask(__m128 __X, __m128 __Y, const int __P) {
+__funline __mmask8 _mm_cmp_ss_mask(__m128 __X, __m128 __Y, const int __P) {
   return (__mmask8)__builtin_ia32_cmpss_mask(
      (__v4sf)__X, (__v4sf)__Y, __P, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mask_cmp_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y, const int __P) {
+__funline __mmask8 _mm_mask_cmp_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y,
+                                        const int __P) {
   return (__mmask8)__builtin_ia32_cmpss_mask(
      (__v4sf)__X, (__v4sf)__Y, __P, (__mmask8)__M, _MM_FROUND_CUR_DIRECTION);
 }
@@ -11928,172 +9879,122 @@ extern __inline __mmask8
                                                      _MM_FROUND_CUR_DIRECTION))
 #endif
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_kmov(__mmask16 __A) {
+__funline __mmask16 _mm512_kmov(__mmask16 __A) {
   return __builtin_ia32_kmovw(__A);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castpd_ps(__m512d __A) {
+__funline __m512 _mm512_castpd_ps(__m512d __A) {
   return (__m512)(__A);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castpd_si512(__m512d __A) {
+__funline __m512i _mm512_castpd_si512(__m512d __A) {
   return (__m512i)(__A);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castps_pd(__m512 __A) {
+__funline __m512d _mm512_castps_pd(__m512 __A) {
   return (__m512d)(__A);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castps_si512(__m512 __A) {
+__funline __m512i _mm512_castps_si512(__m512 __A) {
  return (__m512i)(__A);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castsi512_ps(__m512i __A) {
+__funline __m512 _mm512_castsi512_ps(__m512i __A) {
  return (__m512)(__A);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castsi512_pd(__m512i __A) {
+__funline __m512d _mm512_castsi512_pd(__m512i __A) {
  return (__m512d)(__A);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castpd512_pd128(__m512d __A) {
+__funline __m128d _mm512_castpd512_pd128(__m512d __A) {
   return (__m128d)_mm512_extractf32x4_ps((__m512)__A, 0);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castps512_ps128(__m512 __A) {
+__funline __m128 _mm512_castps512_ps128(__m512 __A) {
   return _mm512_extractf32x4_ps(__A, 0);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castsi512_si128(__m512i __A) {
+__funline __m128i _mm512_castsi512_si128(__m512i __A) {
   return (__m128i)_mm512_extracti32x4_epi32((__m512i)__A, 0);
 }
-extern __inline __m256d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castpd512_pd256(__m512d __A) {
+__funline __m256d _mm512_castpd512_pd256(__m512d __A) {
   return _mm512_extractf64x4_pd(__A, 0);
 }
-extern __inline __m256
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castps512_ps256(__m512 __A) {
+__funline __m256 _mm512_castps512_ps256(__m512 __A) {
  return (__m256)_mm512_extractf64x4_pd((__m512d)__A, 0);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castsi512_si256(__m512i __A) {
  return (__m256i)_mm512_extractf64x4_pd((__m512d)__A, 0);
+__funline __m256i _mm512_castsi512_si256(__m512i __A) {
   return (__m256i)_mm512_extractf64x4_pd((__m512d)__A, 0);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castpd128_pd512(__m128d __A) {
+__funline __m512d _mm512_castpd128_pd512(__m128d __A) {
   return (__m512d)__builtin_ia32_pd512_pd((__m128d)__A);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castps128_ps512(__m128 __A) {
+__funline __m512 _mm512_castps128_ps512(__m128 __A) {
   return (__m512)__builtin_ia32_ps512_ps((__m128)__A);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castsi128_si512(__m128i __A) {
   return (__m512i)__builtin_ia32_si512_si((__v4si)__A);
+__funline __m512i _mm512_castsi128_si512(__m128i __A) {
   return (__m512i)__builtin_ia32_si512_si((__v4si)__A);
 }
-extern __inline __m512d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castpd256_pd512(__m256d __A) {
+__funline __m512d _mm512_castpd256_pd512(__m256d __A) {
   return __builtin_ia32_pd512_256pd(__A);
 }
-extern __inline __m512
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castps256_ps512(__m256 __A) {
   return __builtin_ia32_ps512_256ps(__A);
+__funline __m512 _mm512_castps256_ps512(__m256 __A) {
   return __builtin_ia32_ps512_256ps(__A);
 }
-extern __inline __m512i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_castsi256_si512(__m256i __A) {
+__funline __m512i _mm512_castsi256_si512(__m256i __A) {
   return (__m512i)__builtin_ia32_si512_256si((__v8si)__A);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpeq_epu32_mask(__m512i __A, __m512i __B) {
+__funline __mmask16 _mm512_cmpeq_epu32_mask(__m512i __A, __m512i __B) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__A, (__v16si)__B, 0, (__mmask16)-1);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpeq_epu32_mask(__mmask16 __U, __m512i __A, __m512i __B) {
+__funline __mmask16 _mm512_mask_cmpeq_epu32_mask(__mmask16 __U, __m512i __A,
+                                                 __m512i __B) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__A, (__v16si)__B, 0, __U);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpeq_epu64_mask(__mmask8 __U, __m512i __A, __m512i __B) {
+__funline __mmask8 _mm512_mask_cmpeq_epu64_mask(__mmask8 __U, __m512i __A,
+                                                __m512i __B) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__A, (__v8di)__B, 0, __U);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpeq_epu64_mask(__m512i __A, __m512i __B) {
+__funline __mmask8 _mm512_cmpeq_epu64_mask(__m512i __A, __m512i __B) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__A, (__v8di)__B, 0, (__mmask8)-1);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpgt_epu32_mask(__m512i __A, __m512i __B) {
+__funline __mmask16 _mm512_cmpgt_epu32_mask(__m512i __A, __m512i __B) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__A, (__v16si)__B, 6, (__mmask16)-1);
 }
-extern __inline __mmask16
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpgt_epu32_mask(__mmask16 __U, __m512i __A, __m512i __B) {
+__funline __mmask16 _mm512_mask_cmpgt_epu32_mask(__mmask16 __U, __m512i __A,
+                                                 __m512i __B) {
   return (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)__A, (__v16si)__B, 6, __U);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_cmpgt_epu64_mask(__mmask8 __U, __m512i __A, __m512i __B) {
+__funline __mmask8 _mm512_mask_cmpgt_epu64_mask(__mmask8 __U, __m512i __A,
+                                                __m512i __B) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__A, (__v8di)__B, 6, __U);
 }
-extern __inline __mmask8
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_cmpgt_epu64_mask(__m512i __A, __m512i __B) {
+__funline __mmask8 _mm512_cmpgt_epu64_mask(__m512i __A, __m512i __B) {
   return (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)__A, (__v8di)__B, 6, (__mmask8)-1);
 }
@@ -12110,54 +10011,38 @@ extern __inline __mmask8
   __v4si __T8 = __T6 op __T7;                                                \
   return __T8[0] op __T8[1]
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_reduce_add_epi32(__m512i __A) {
+__funline int _mm512_reduce_add_epi32(__m512i __A) {
   __MM512_REDUCE_OP(+);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_reduce_mul_epi32(__m512i __A) {
+__funline int _mm512_reduce_mul_epi32(__m512i __A) {
   __MM512_REDUCE_OP(*);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_reduce_and_epi32(__m512i __A) {
+__funline int _mm512_reduce_and_epi32(__m512i __A) {
   __MM512_REDUCE_OP(&);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_reduce_or_epi32(__m512i __A) {
+__funline int _mm512_reduce_or_epi32(__m512i __A) {
   __MM512_REDUCE_OP(|);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_reduce_add_epi32(__mmask16 __U, __m512i __A) {
+__funline int _mm512_mask_reduce_add_epi32(__mmask16 __U, __m512i __A) {
   __A = _mm512_maskz_mov_epi32(__U, __A);
   __MM512_REDUCE_OP(+);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_reduce_mul_epi32(__mmask16 __U, __m512i __A) {
+__funline int _mm512_mask_reduce_mul_epi32(__mmask16 __U, __m512i __A) {
  __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __U, __A);
   __MM512_REDUCE_OP(*);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm512_mask_reduce_and_epi32(__mmask16 __U, __m512i __A) {
  __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0), __U, __A);
+__funline int _mm512_mask_reduce_and_epi32(__mmask16 __U, __m512i __A) {
   __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0), __U, __A);
   __MM512_REDUCE_OP(&);
 }
-extern __inline int
-    __attribute__((__gnu_inline__,
__always_inline__, __artificial__)) - _mm512_mask_reduce_or_epi32(__mmask16 __U, __m512i __A) { +__funline int _mm512_mask_reduce_or_epi32(__mmask16 __U, __m512i __A) { __A = _mm512_maskz_mov_epi32(__U, __A); __MM512_REDUCE_OP(|); } @@ -12178,54 +10063,38 @@ extern __inline int __v4si __T10 = (__v4si)_mm_##op(__T8, __T9); \ return __T10[0] -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_min_epi32(__m512i __A) { +__funline int _mm512_reduce_min_epi32(__m512i __A) { __MM512_REDUCE_OP(min_epi32); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_max_epi32(__m512i __A) { +__funline int _mm512_reduce_max_epi32(__m512i __A) { __MM512_REDUCE_OP(max_epi32); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_min_epu32(__m512i __A) { +__funline unsigned int _mm512_reduce_min_epu32(__m512i __A) { __MM512_REDUCE_OP(min_epu32); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_max_epu32(__m512i __A) { +__funline unsigned int _mm512_reduce_max_epu32(__m512i __A) { __MM512_REDUCE_OP(max_epu32); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_min_epi32(__mmask16 __U, __m512i __A) { +__funline int _mm512_mask_reduce_min_epi32(__mmask16 __U, __m512i __A) { __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __U, __A); __MM512_REDUCE_OP(min_epi32); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_max_epi32(__mmask16 __U, __m512i __A) { +__funline int _mm512_mask_reduce_max_epi32(__mmask16 __U, __m512i __A) { __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __U, __A); __MM512_REDUCE_OP(max_epi32); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_min_epu32(__mmask16 __U, __m512i __A) { +__funline unsigned int _mm512_mask_reduce_min_epu32(__mmask16 __U, __m512i __A) { __A = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0), __U, __A); __MM512_REDUCE_OP(min_epu32); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_max_epu32(__mmask16 __U, __m512i __A) { +__funline unsigned int _mm512_mask_reduce_max_epu32(__mmask16 __U, __m512i __A) { __A = _mm512_maskz_mov_epi32(__U, __A); __MM512_REDUCE_OP(max_epu32); } @@ -12242,28 +10111,20 @@ extern __inline unsigned int __m128 __T8 = __T6 op __T7; \ return __T8[0] op __T8[1] -extern __inline float - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_add_ps(__m512 __A) { +__funline float _mm512_reduce_add_ps(__m512 __A) { __MM512_REDUCE_OP(+); } -extern __inline float - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_mul_ps(__m512 __A) { +__funline float _mm512_reduce_mul_ps(__m512 __A) { __MM512_REDUCE_OP(*); } -extern __inline float - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_add_ps(__mmask16 __U, __m512 __A) { +__funline float _mm512_mask_reduce_add_ps(__mmask16 __U, __m512 __A) { __A = _mm512_maskz_mov_ps(__U, __A); __MM512_REDUCE_OP(+); } -extern __inline float - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_mul_ps(__mmask16 __U, __m512 __A) { +__funline float 
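/* Note: __MM512_REDUCE_OP builds each horizontal reduction by folding the
   upper half of the vector onto the lower half repeatedly (512 -> 256 ->
   128 -> scalar), so a 16-lane reduction costs O(log n) vector operations.
   The masked forms first substitute the operation's identity element into
   the masked-off lanes (0 for +, 1 for *, ~0 for &, the type's extremum
   for min/max), which leaves the fold itself unchanged. A hedged usage
   sketch, assuming an AVX-512F target:
     int sum = _mm512_reduce_add_epi32(_mm512_set1_epi32(2));  // == 32
*/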
_mm512_mask_reduce_mul_ps(__mmask16 __U, __m512 __A) { __A = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __U, __A); __MM512_REDUCE_OP(*); } @@ -12282,28 +10143,20 @@ extern __inline float __m128 __T10 = _mm_##op(__T8, __T9); \ return __T10[0] -extern __inline float - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_min_ps(__m512 __A) { +__funline float _mm512_reduce_min_ps(__m512 __A) { __MM512_REDUCE_OP(min_ps); } -extern __inline float - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_max_ps(__m512 __A) { +__funline float _mm512_reduce_max_ps(__m512 __A) { __MM512_REDUCE_OP(max_ps); } -extern __inline float - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_min_ps(__mmask16 __U, __m512 __A) { +__funline float _mm512_mask_reduce_min_ps(__mmask16 __U, __m512 __A) { __A = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __U, __A); __MM512_REDUCE_OP(min_ps); } -extern __inline float - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_max_ps(__mmask16 __U, __m512 __A) { +__funline float _mm512_mask_reduce_max_ps(__mmask16 __U, __m512 __A) { __A = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __U, __A); __MM512_REDUCE_OP(max_ps); } @@ -12318,54 +10171,38 @@ extern __inline float __v2di __T6 = __T4 op __T5; \ return __T6[0] op __T6[1] -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_add_epi64(__m512i __A) { +__funline long long _mm512_reduce_add_epi64(__m512i __A) { __MM512_REDUCE_OP(+); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_mul_epi64(__m512i __A) { +__funline long long _mm512_reduce_mul_epi64(__m512i __A) { __MM512_REDUCE_OP(*); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_and_epi64(__m512i __A) { +__funline long long _mm512_reduce_and_epi64(__m512i __A) { __MM512_REDUCE_OP(&); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_or_epi64(__m512i __A) { +__funline long long _mm512_reduce_or_epi64(__m512i __A) { __MM512_REDUCE_OP(|); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_add_epi64(__mmask8 __U, __m512i __A) { +__funline long long _mm512_mask_reduce_add_epi64(__mmask8 __U, __m512i __A) { __A = _mm512_maskz_mov_epi64(__U, __A); __MM512_REDUCE_OP(+); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_mul_epi64(__mmask8 __U, __m512i __A) { +__funline long long _mm512_mask_reduce_mul_epi64(__mmask8 __U, __m512i __A) { __A = _mm512_mask_mov_epi64(_mm512_set1_epi64(1LL), __U, __A); __MM512_REDUCE_OP(*); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_and_epi64(__mmask8 __U, __m512i __A) { +__funline long long _mm512_mask_reduce_and_epi64(__mmask8 __U, __m512i __A) { __A = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0LL), __U, __A); __MM512_REDUCE_OP(&); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_or_epi64(__mmask8 __U, __m512i __A) { +__funline long long _mm512_mask_reduce_or_epi64(__mmask8 __U, __m512i __A) { __A = _mm512_maskz_mov_epi64(__U, __A); __MM512_REDUCE_OP(|); } @@ -12382,55 +10219,41 @@ 
extern __inline long long __v8di __T6 = (__v8di)_mm512_##op(__T4, __T5); \ return __T6[0] -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_min_epi64(__m512i __A) { +__funline long long _mm512_reduce_min_epi64(__m512i __A) { __MM512_REDUCE_OP(min_epi64); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_max_epi64(__m512i __A) { +__funline long long _mm512_reduce_max_epi64(__m512i __A) { __MM512_REDUCE_OP(max_epi64); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_min_epi64(__mmask8 __U, __m512i __A) { +__funline long long _mm512_mask_reduce_min_epi64(__mmask8 __U, __m512i __A) { __A = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __U, __A); __MM512_REDUCE_OP(min_epi64); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_max_epi64(__mmask8 __U, __m512i __A) { +__funline long long _mm512_mask_reduce_max_epi64(__mmask8 __U, __m512i __A) { __A = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1), __U, __A); __MM512_REDUCE_OP(max_epi64); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_min_epu64(__m512i __A) { +__funline unsigned long long _mm512_reduce_min_epu64(__m512i __A) { __MM512_REDUCE_OP(min_epu64); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_max_epu64(__m512i __A) { +__funline unsigned long long _mm512_reduce_max_epu64(__m512i __A) { __MM512_REDUCE_OP(max_epu64); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_min_epu64(__mmask8 __U, __m512i __A) { +__funline unsigned long long _mm512_mask_reduce_min_epu64(__mmask8 __U, + __m512i __A) { __A = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0LL), __U, __A); __MM512_REDUCE_OP(min_epu64); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_max_epu64(__mmask8 __U, __m512i __A) { +__funline unsigned long long _mm512_mask_reduce_max_epu64(__mmask8 __U, + __m512i __A) { __A = _mm512_maskz_mov_epi64(__U, __A); __MM512_REDUCE_OP(max_epu64); } @@ -12445,28 +10268,20 @@ extern __inline unsigned long long __m128d __T6 = __T4 op __T5; \ return __T6[0] op __T6[1] -extern __inline double - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_add_pd(__m512d __A) { +__funline double _mm512_reduce_add_pd(__m512d __A) { __MM512_REDUCE_OP(+); } -extern __inline double - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_mul_pd(__m512d __A) { +__funline double _mm512_reduce_mul_pd(__m512d __A) { __MM512_REDUCE_OP(*); } -extern __inline double - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_add_pd(__mmask8 __U, __m512d __A) { +__funline double _mm512_mask_reduce_add_pd(__mmask8 __U, __m512d __A) { __A = _mm512_maskz_mov_pd(__U, __A); __MM512_REDUCE_OP(+); } -extern __inline double - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_mul_pd(__mmask8 __U, __m512d __A) { +__funline double _mm512_mask_reduce_mul_pd(__mmask8 __U, __m512d __A) { __A = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __U, __A); __MM512_REDUCE_OP(*); } @@ -12483,28 
+10298,20 @@ extern __inline double __m128d __T8 = _mm_##op(__T6, __T7); \ return __T8[0] -extern __inline double - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_min_pd(__m512d __A) { +__funline double _mm512_reduce_min_pd(__m512d __A) { __MM512_REDUCE_OP(min_pd); } -extern __inline double - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_reduce_max_pd(__m512d __A) { +__funline double _mm512_reduce_max_pd(__m512d __A) { __MM512_REDUCE_OP(max_pd); } -extern __inline double - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_min_pd(__mmask8 __U, __m512d __A) { +__funline double _mm512_mask_reduce_min_pd(__mmask8 __U, __m512d __A) { __A = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __U, __A); __MM512_REDUCE_OP(min_pd); } -extern __inline double - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_reduce_max_pd(__mmask8 __U, __m512d __A) { +__funline double _mm512_mask_reduce_max_pd(__mmask8 __U, __m512d __A) { __A = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __U, __A); __MM512_REDUCE_OP(max_pd); } diff --git a/third_party/intel/avx512ifmaintrin.internal.h b/third_party/intel/avx512ifmaintrin.internal.h index 4efd6fe4f..48ea3859b 100644 --- a/third_party/intel/avx512ifmaintrin.internal.h +++ b/third_party/intel/avx512ifmaintrin.internal.h @@ -11,48 +11,36 @@ #define __DISABLE_AVX512IFMA__ #endif /* __AVX512IFMA__ */ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) { +__funline __m512i _mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) { return (__m512i)__builtin_ia32_vpmadd52luq512_mask((__v8di)__X, (__v8di)__Y, (__v8di)__Z, (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) { +__funline __m512i _mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) { return (__m512i)__builtin_ia32_vpmadd52huq512_mask((__v8di)__X, (__v8di)__Y, (__v8di)__Z, (__mmask8)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_madd52lo_epu64(__m512i __W, __mmask8 __M, __m512i __X, - __m512i __Y) { +__funline __m512i _mm512_mask_madd52lo_epu64(__m512i __W, __mmask8 __M, + __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_vpmadd52luq512_mask( (__v8di)__W, (__v8di)__X, (__v8di)__Y, (__mmask8)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_madd52hi_epu64(__m512i __W, __mmask8 __M, __m512i __X, - __m512i __Y) { +__funline __m512i _mm512_mask_madd52hi_epu64(__m512i __W, __mmask8 __M, + __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_vpmadd52huq512_mask( (__v8di)__W, (__v8di)__X, (__v8di)__Y, (__mmask8)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_madd52lo_epu64(__mmask8 __M, __m512i __X, __m512i __Y, - __m512i __Z) { +__funline __m512i _mm512_maskz_madd52lo_epu64(__mmask8 __M, __m512i __X, + __m512i __Y, __m512i __Z) { return (__m512i)__builtin_ia32_vpmadd52luq512_maskz( (__v8di)__X, (__v8di)__Y, (__v8di)__Z, (__mmask8)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_madd52hi_epu64(__mmask8 __M, __m512i __X, __m512i __Y, - __m512i __Z) { +__funline 
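/* Note: these madd52 intrinsics treat each 64-bit lane of the two
   multiplicands as an unsigned 52-bit integer, form the full 104-bit
   product, and accumulate either its low (lo) or high (hi) 52 bits into
   the first operand; this is the usual building block for vectorized
   big-integer and Montgomery multiplication. A hedged sketch, assuming
   AVX512IFMA (acc, a, b being hypothetical __m512i values):
     acc = _mm512_madd52lo_epu64(acc, a, b);  // acc[i] += low52(a[i]*b[i])
*/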
__m512i _mm512_maskz_madd52hi_epu64(__mmask8 __M, __m512i __X, + __m512i __Y, __m512i __Z) { return (__m512i)__builtin_ia32_vpmadd52huq512_maskz( (__v8di)__X, (__v8di)__Y, (__v8di)__Z, (__mmask8)__M); } diff --git a/third_party/intel/avx512ifmavlintrin.internal.h b/third_party/intel/avx512ifmavlintrin.internal.h index 2f7abafd3..7bc9d68ef 100644 --- a/third_party/intel/avx512ifmavlintrin.internal.h +++ b/third_party/intel/avx512ifmavlintrin.internal.h @@ -12,90 +12,70 @@ #define __DISABLE_AVX512IFMAVL__ #endif /* __AVX512IFMAVL__ */ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) { +__funline __m128i _mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) { return (__m128i)__builtin_ia32_vpmadd52luq128_mask((__v2di)__X, (__v2di)__Y, (__v2di)__Z, (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) { +__funline __m128i _mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) { return (__m128i)__builtin_ia32_vpmadd52huq128_mask((__v2di)__X, (__v2di)__Y, (__v2di)__Z, (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) { +__funline __m256i _mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) { return (__m256i)__builtin_ia32_vpmadd52luq256_mask((__v4di)__X, (__v4di)__Y, (__v4di)__Z, (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) { +__funline __m256i _mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) { return (__m256i)__builtin_ia32_vpmadd52huq256_mask((__v4di)__X, (__v4di)__Y, (__v4di)__Z, (__mmask8)-1); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_madd52lo_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { +__funline __m128i _mm_mask_madd52lo_epu64(__m128i __W, __mmask8 __M, __m128i __X, + __m128i __Y) { return (__m128i)__builtin_ia32_vpmadd52luq128_mask( (__v2di)__W, (__v2di)__X, (__v2di)__Y, (__mmask8)__M); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_madd52hi_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { +__funline __m128i _mm_mask_madd52hi_epu64(__m128i __W, __mmask8 __M, __m128i __X, + __m128i __Y) { return (__m128i)__builtin_ia32_vpmadd52huq128_mask( (__v2di)__W, (__v2di)__X, (__v2di)__Y, (__mmask8)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_madd52lo_epu64(__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) { +__funline __m256i _mm256_mask_madd52lo_epu64(__m256i __W, __mmask8 __M, + __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_vpmadd52luq256_mask( (__v4di)__W, (__v4di)__X, (__v4di)__Y, (__mmask8)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_madd52hi_epu64(__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) { +__funline __m256i _mm256_mask_madd52hi_epu64(__m256i __W, __mmask8 __M, + __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_vpmadd52huq256_mask( (__v4di)__W, (__v4di)__X, (__v4di)__Y, (__mmask8)__M); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - 
__artificial__)) -_mm_maskz_madd52lo_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { +__funline __m128i _mm_maskz_madd52lo_epu64(__mmask8 __M, __m128i __X, __m128i __Y, + __m128i __Z) { return (__m128i)__builtin_ia32_vpmadd52luq128_maskz( (__v2di)__X, (__v2di)__Y, (__v2di)__Z, (__mmask8)__M); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_madd52hi_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { +__funline __m128i _mm_maskz_madd52hi_epu64(__mmask8 __M, __m128i __X, __m128i __Y, + __m128i __Z) { return (__m128i)__builtin_ia32_vpmadd52huq128_maskz( (__v2di)__X, (__v2di)__Y, (__v2di)__Z, (__mmask8)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_madd52lo_epu64(__mmask8 __M, __m256i __X, __m256i __Y, - __m256i __Z) { +__funline __m256i _mm256_maskz_madd52lo_epu64(__mmask8 __M, __m256i __X, + __m256i __Y, __m256i __Z) { return (__m256i)__builtin_ia32_vpmadd52luq256_maskz( (__v4di)__X, (__v4di)__Y, (__v4di)__Z, (__mmask8)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_madd52hi_epu64(__mmask8 __M, __m256i __X, __m256i __Y, - __m256i __Z) { +__funline __m256i _mm256_maskz_madd52hi_epu64(__mmask8 __M, __m256i __X, + __m256i __Y, __m256i __Z) { return (__m256i)__builtin_ia32_vpmadd52huq256_maskz( (__v4di)__X, (__v4di)__Y, (__v4di)__Z, (__mmask8)__M); } diff --git a/third_party/intel/avx512pfintrin.internal.h b/third_party/intel/avx512pfintrin.internal.h index e73bc9081..4401d24f6 100644 --- a/third_party/intel/avx512pfintrin.internal.h +++ b/third_party/intel/avx512pfintrin.internal.h @@ -18,132 +18,101 @@ typedef unsigned char __mmask8; typedef unsigned short __mmask16; #ifdef __OPTIMIZE__ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_prefetch_i32gather_pd(__m256i __index, void const *__addr, - int __scale, int __hint) { +__funline void _mm512_prefetch_i32gather_pd(__m256i __index, void const *__addr, + int __scale, int __hint) { __builtin_ia32_gatherpfdpd((__mmask8)0xFF, (__v8si)__index, __addr, __scale, __hint); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_prefetch_i32gather_ps(__m512i __index, void const *__addr, - int __scale, int __hint) { +__funline void _mm512_prefetch_i32gather_ps(__m512i __index, void const *__addr, + int __scale, int __hint) { __builtin_ia32_gatherpfdps((__mmask16)0xFFFF, (__v16si)__index, __addr, __scale, __hint); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_prefetch_i32gather_pd(__m256i __index, __mmask8 __mask, - void const *__addr, int __scale, - int __hint) { +__funline void _mm512_mask_prefetch_i32gather_pd(__m256i __index, __mmask8 __mask, + void const *__addr, int __scale, + int __hint) { __builtin_ia32_gatherpfdpd(__mask, (__v8si)__index, __addr, __scale, __hint); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_prefetch_i32gather_ps(__m512i __index, __mmask16 __mask, - void const *__addr, int __scale, - int __hint) { +__funline void _mm512_mask_prefetch_i32gather_ps(__m512i __index, + __mmask16 __mask, + void const *__addr, int __scale, + int __hint) { __builtin_ia32_gatherpfdps(__mask, (__v16si)__index, __addr, __scale, __hint); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm512_prefetch_i64gather_pd(__m512i __index, void const *__addr, - int __scale, int __hint) { +__funline void _mm512_prefetch_i64gather_pd(__m512i __index, void const *__addr, + int __scale, int __hint) { __builtin_ia32_gatherpfqpd((__mmask8)0xFF, (__v8di)__index, __addr, __scale, __hint); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_prefetch_i64gather_ps(__m512i __index, void const *__addr, - int __scale, int __hint) { +__funline void _mm512_prefetch_i64gather_ps(__m512i __index, void const *__addr, + int __scale, int __hint) { __builtin_ia32_gatherpfqps((__mmask8)0xFF, (__v8di)__index, __addr, __scale, __hint); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_prefetch_i64gather_pd(__m512i __index, __mmask8 __mask, - void const *__addr, int __scale, - int __hint) { +__funline void _mm512_mask_prefetch_i64gather_pd(__m512i __index, __mmask8 __mask, + void const *__addr, int __scale, + int __hint) { __builtin_ia32_gatherpfqpd(__mask, (__v8di)__index, __addr, __scale, __hint); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_prefetch_i64gather_ps(__m512i __index, __mmask8 __mask, - void const *__addr, int __scale, - int __hint) { +__funline void _mm512_mask_prefetch_i64gather_ps(__m512i __index, __mmask8 __mask, + void const *__addr, int __scale, + int __hint) { __builtin_ia32_gatherpfqps(__mask, (__v8di)__index, __addr, __scale, __hint); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_prefetch_i32scatter_pd(void *__addr, __m256i __index, int __scale, - int __hint) { +__funline void _mm512_prefetch_i32scatter_pd(void *__addr, __m256i __index, + int __scale, int __hint) { __builtin_ia32_scatterpfdpd((__mmask8)0xFF, (__v8si)__index, __addr, __scale, __hint); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_prefetch_i32scatter_ps(void *__addr, __m512i __index, int __scale, - int __hint) { +__funline void _mm512_prefetch_i32scatter_ps(void *__addr, __m512i __index, + int __scale, int __hint) { __builtin_ia32_scatterpfdps((__mmask16)0xFFFF, (__v16si)__index, __addr, __scale, __hint); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_prefetch_i32scatter_pd(void *__addr, __mmask8 __mask, - __m256i __index, int __scale, - int __hint) { +__funline void _mm512_mask_prefetch_i32scatter_pd(void *__addr, __mmask8 __mask, + __m256i __index, int __scale, + int __hint) { __builtin_ia32_scatterpfdpd(__mask, (__v8si)__index, __addr, __scale, __hint); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_prefetch_i32scatter_ps(void *__addr, __mmask16 __mask, - __m512i __index, int __scale, - int __hint) { +__funline void _mm512_mask_prefetch_i32scatter_ps(void *__addr, __mmask16 __mask, + __m512i __index, int __scale, + int __hint) { __builtin_ia32_scatterpfdps(__mask, (__v16si)__index, __addr, __scale, __hint); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_prefetch_i64scatter_pd(void *__addr, __m512i __index, int __scale, - int __hint) { +__funline void _mm512_prefetch_i64scatter_pd(void *__addr, __m512i __index, + int __scale, int __hint) { __builtin_ia32_scatterpfqpd((__mmask8)0xFF, (__v8di)__index, __addr, __scale, __hint); } -extern __inline void - 
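/* Note: for these gather/scatter prefetches, __scale must be 1, 2, 4 or 8
   and __hint selects the cache level (_MM_HINT_T0 or _MM_HINT_T1); they
   only warm the cache, so the masked forms can prefetch a sparse subset
   of lanes cheaply. A hedged sketch, assuming AVX512PF (idx being a
   hypothetical __m512i index vector and base a data pointer):
     _mm512_prefetch_i32gather_ps(idx, base, 4, _MM_HINT_T0);
*/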
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_prefetch_i64scatter_ps(void *__addr, __m512i __index, int __scale, - int __hint) { +__funline void _mm512_prefetch_i64scatter_ps(void *__addr, __m512i __index, + int __scale, int __hint) { __builtin_ia32_scatterpfqps((__mmask8)0xFF, (__v8di)__index, __addr, __scale, __hint); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_prefetch_i64scatter_pd(void *__addr, __mmask8 __mask, - __m512i __index, int __scale, - int __hint) { +__funline void _mm512_mask_prefetch_i64scatter_pd(void *__addr, __mmask8 __mask, + __m512i __index, int __scale, + int __hint) { __builtin_ia32_scatterpfqpd(__mask, (__v8di)__index, __addr, __scale, __hint); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_prefetch_i64scatter_ps(void *__addr, __mmask8 __mask, - __m512i __index, int __scale, - int __hint) { +__funline void _mm512_mask_prefetch_i64scatter_ps(void *__addr, __mmask8 __mask, + __m512i __index, int __scale, + int __hint) { __builtin_ia32_scatterpfqps(__mask, (__v8di)__index, __addr, __scale, __hint); } diff --git a/third_party/intel/avx512vbmi2intrin.internal.h b/third_party/intel/avx512vbmi2intrin.internal.h index 8200b23b9..1ab952209 100644 --- a/third_party/intel/avx512vbmi2intrin.internal.h +++ b/third_party/intel/avx512vbmi2intrin.internal.h @@ -13,101 +13,77 @@ #endif /* __AVX512VBMI2__ */ #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shrdi_epi16(__m512i __A, __m512i __B, int __C) { +__funline __m512i _mm512_shrdi_epi16(__m512i __A, __m512i __B, int __C) { return (__m512i)__builtin_ia32_vpshrd_v32hi((__v32hi)__A, (__v32hi)__B, __C); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shrdi_epi32(__m512i __A, __m512i __B, int __C) { +__funline __m512i _mm512_shrdi_epi32(__m512i __A, __m512i __B, int __C) { return (__m512i)__builtin_ia32_vpshrd_v16si((__v16si)__A, (__v16si)__B, __C); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_shrdi_epi32(__m512i __A, __mmask16 __B, __m512i __C, - __m512i __D, int __E) { +__funline __m512i _mm512_mask_shrdi_epi32(__m512i __A, __mmask16 __B, __m512i __C, + __m512i __D, int __E) { return (__m512i)__builtin_ia32_vpshrd_v16si_mask( (__v16si)__C, (__v16si)__D, __E, (__v16si)__A, (__mmask16)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_shrdi_epi32(__mmask16 __A, __m512i __B, __m512i __C, int __D) { +__funline __m512i _mm512_maskz_shrdi_epi32(__mmask16 __A, __m512i __B, + __m512i __C, int __D) { return (__m512i)__builtin_ia32_vpshrd_v16si_mask( (__v16si)__B, (__v16si)__C, __D, (__v16si)_mm512_setzero_si512(), (__mmask16)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shrdi_epi64(__m512i __A, __m512i __B, int __C) { +__funline __m512i _mm512_shrdi_epi64(__m512i __A, __m512i __B, int __C) { return (__m512i)__builtin_ia32_vpshrd_v8di((__v8di)__A, (__v8di)__B, __C); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_shrdi_epi64(__m512i __A, __mmask8 __B, __m512i __C, __m512i __D, - int __E) { +__funline __m512i _mm512_mask_shrdi_epi64(__m512i __A, __mmask8 __B, __m512i __C, + __m512i __D, int __E) { return 
(__m512i)__builtin_ia32_vpshrd_v8di_mask((__v8di)__C, (__v8di)__D, __E, (__v8di)__A, (__mmask8)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_shrdi_epi64(__mmask8 __A, __m512i __B, __m512i __C, int __D) { +__funline __m512i _mm512_maskz_shrdi_epi64(__mmask8 __A, __m512i __B, __m512i __C, + int __D) { return (__m512i)__builtin_ia32_vpshrd_v8di_mask( (__v8di)__B, (__v8di)__C, __D, (__v8di)_mm512_setzero_si512(), (__mmask8)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shldi_epi16(__m512i __A, __m512i __B, int __C) { +__funline __m512i _mm512_shldi_epi16(__m512i __A, __m512i __B, int __C) { return (__m512i)__builtin_ia32_vpshld_v32hi((__v32hi)__A, (__v32hi)__B, __C); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shldi_epi32(__m512i __A, __m512i __B, int __C) { +__funline __m512i _mm512_shldi_epi32(__m512i __A, __m512i __B, int __C) { return (__m512i)__builtin_ia32_vpshld_v16si((__v16si)__A, (__v16si)__B, __C); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_shldi_epi32(__m512i __A, __mmask16 __B, __m512i __C, - __m512i __D, int __E) { +__funline __m512i _mm512_mask_shldi_epi32(__m512i __A, __mmask16 __B, __m512i __C, + __m512i __D, int __E) { return (__m512i)__builtin_ia32_vpshld_v16si_mask( (__v16si)__C, (__v16si)__D, __E, (__v16si)__A, (__mmask16)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_shldi_epi32(__mmask16 __A, __m512i __B, __m512i __C, int __D) { +__funline __m512i _mm512_maskz_shldi_epi32(__mmask16 __A, __m512i __B, + __m512i __C, int __D) { return (__m512i)__builtin_ia32_vpshld_v16si_mask( (__v16si)__B, (__v16si)__C, __D, (__v16si)_mm512_setzero_si512(), (__mmask16)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shldi_epi64(__m512i __A, __m512i __B, int __C) { +__funline __m512i _mm512_shldi_epi64(__m512i __A, __m512i __B, int __C) { return (__m512i)__builtin_ia32_vpshld_v8di((__v8di)__A, (__v8di)__B, __C); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_shldi_epi64(__m512i __A, __mmask8 __B, __m512i __C, __m512i __D, - int __E) { +__funline __m512i _mm512_mask_shldi_epi64(__m512i __A, __mmask8 __B, __m512i __C, + __m512i __D, int __E) { return (__m512i)__builtin_ia32_vpshld_v8di_mask((__v8di)__C, (__v8di)__D, __E, (__v8di)__A, (__mmask8)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_shldi_epi64(__mmask8 __A, __m512i __B, __m512i __C, int __D) { +__funline __m512i _mm512_maskz_shldi_epi64(__mmask8 __A, __m512i __B, __m512i __C, + int __D) { return (__m512i)__builtin_ia32_vpshld_v8di_mask( (__v8di)__B, (__v8di)__C, __D, (__v8di)_mm512_setzero_si512(), (__mmask8)__A); @@ -161,99 +137,79 @@ extern __inline __m512i (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(A)) #endif -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C) { +__funline __m512i _mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C) { return (__m512i)__builtin_ia32_vpshrdv_v32hi((__v32hi)__A, (__v32hi)__B, (__v32hi)__C); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C) { +__funline __m512i _mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C) { return (__m512i)__builtin_ia32_vpshrdv_v16si((__v16si)__A, (__v16si)__B, (__v16si)__C); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +__funline __m512i _mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __B, __m512i __C, + __m512i __D) { return (__m512i)__builtin_ia32_vpshrdv_v16si_mask( (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_maskz_shrdv_epi32(__mmask16 __A, __m512i __B, __m512i __C, __m512i __D) { +__funline __m512i _mm512_maskz_shrdv_epi32(__mmask16 __A, __m512i __B, + __m512i __C, __m512i __D) { return (__m512i)__builtin_ia32_vpshrdv_v16si_maskz( (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C) { +__funline __m512i _mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C) { return (__m512i)__builtin_ia32_vpshrdv_v8di((__v8di)__A, (__v8di)__B, (__v8di)__C); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __B, __m512i __C, __m512i __D) { +__funline __m512i _mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __B, __m512i __C, + __m512i __D) { return (__m512i)__builtin_ia32_vpshrdv_v8di_mask((__v8di)__A, (__v8di)__C, (__v8di)__D, (__mmask8)__B); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_maskz_shrdv_epi64(__mmask8 __A, __m512i __B, __m512i __C, __m512i __D) { +__funline __m512i _mm512_maskz_shrdv_epi64(__mmask8 __A, __m512i __B, __m512i __C, + __m512i __D) { return (__m512i)__builtin_ia32_vpshrdv_v8di_maskz((__v8di)__B, (__v8di)__C, (__v8di)__D, (__mmask8)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C) { +__funline __m512i _mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C) { return (__m512i)__builtin_ia32_vpshldv_v32hi((__v32hi)__A, (__v32hi)__B, (__v32hi)__C); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C) { +__funline __m512i _mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C) { return (__m512i)__builtin_ia32_vpshldv_v16si((__v16si)__A, (__v16si)__B, (__v16si)__C); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_shldv_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +__funline __m512i _mm512_mask_shldv_epi32(__m512i __A, __mmask16 __B, __m512i __C, + __m512i __D) { return (__m512i)__builtin_ia32_vpshldv_v16si_mask( (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_maskz_shldv_epi32(__mmask16 __A, __m512i __B, __m512i __C, __m512i __D) { +__funline __m512i _mm512_maskz_shldv_epi32(__mmask16 __A, __m512i __B, + __m512i __C, __m512i __D) { return (__m512i)__builtin_ia32_vpshldv_v16si_maskz( (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); } -extern 
__inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C) { +__funline __m512i _mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C) { return (__m512i)__builtin_ia32_vpshldv_v8di((__v8di)__A, (__v8di)__B, (__v8di)__C); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_shldv_epi64(__m512i __A, __mmask8 __B, __m512i __C, __m512i __D) { +__funline __m512i _mm512_mask_shldv_epi64(__m512i __A, __mmask8 __B, __m512i __C, + __m512i __D) { return (__m512i)__builtin_ia32_vpshldv_v8di_mask((__v8di)__A, (__v8di)__C, (__v8di)__D, (__mmask8)__B); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_maskz_shldv_epi64(__mmask8 __A, __m512i __B, __m512i __C, __m512i __D) { +__funline __m512i _mm512_maskz_shldv_epi64(__mmask8 __A, __m512i __B, __m512i __C, + __m512i __D) { return (__m512i)__builtin_ia32_vpshldv_v8di_maskz((__v8di)__B, (__v8di)__C, (__v8di)__D, (__mmask8)__A); } @@ -270,132 +226,106 @@ _mm512_maskz_shldv_epi64(__mmask8 __A, __m512i __B, __m512i __C, __m512i __D) { #define __DISABLE_AVX512VBMI2BW__ #endif /* __AVX512VBMI2BW__ */ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_compress_epi8(__m512i __A, __mmask64 __B, __m512i __C) { +__funline __m512i _mm512_mask_compress_epi8(__m512i __A, __mmask64 __B, + __m512i __C) { return (__m512i)__builtin_ia32_compressqi512_mask((__v64qi)__C, (__v64qi)__A, (__mmask64)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_compress_epi8(__mmask64 __A, __m512i __B) { +__funline __m512i _mm512_maskz_compress_epi8(__mmask64 __A, __m512i __B) { return (__m512i)__builtin_ia32_compressqi512_mask( (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__A); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_compressstoreu_epi8(void *__A, __mmask64 __B, __m512i __C) { +__funline void _mm512_mask_compressstoreu_epi8(void *__A, __mmask64 __B, + __m512i __C) { __builtin_ia32_compressstoreuqi512_mask((__v64qi *)__A, (__v64qi)__C, (__mmask64)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_compress_epi16(__m512i __A, __mmask32 __B, __m512i __C) { +__funline __m512i _mm512_mask_compress_epi16(__m512i __A, __mmask32 __B, + __m512i __C) { return (__m512i)__builtin_ia32_compresshi512_mask((__v32hi)__C, (__v32hi)__A, (__mmask32)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_compress_epi16(__mmask32 __A, __m512i __B) { +__funline __m512i _mm512_maskz_compress_epi16(__mmask32 __A, __m512i __B) { return (__m512i)__builtin_ia32_compresshi512_mask( (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__A); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_compressstoreu_epi16(void *__A, __mmask32 __B, __m512i __C) { +__funline void _mm512_mask_compressstoreu_epi16(void *__A, __mmask32 __B, + __m512i __C) { __builtin_ia32_compressstoreuhi512_mask((__v32hi *)__A, (__v32hi)__C, (__mmask32)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_expand_epi8(__m512i __A, __mmask64 __B, __m512i __C) { +__funline __m512i _mm512_mask_expand_epi8(__m512i 
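/* Note: expand is the inverse of compress above -- it takes elements
   packed at the low end of the source (or loaded from memory in the
   expandloadu forms) and scatters them into the lanes selected by the
   mask, zeroing (maskz) or preserving (mask) the remaining lanes. */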
__A, __mmask64 __B, + __m512i __C) { return (__m512i)__builtin_ia32_expandqi512_mask((__v64qi)__C, (__v64qi)__A, (__mmask64)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_expand_epi8(__mmask64 __A, __m512i __B) { +__funline __m512i _mm512_maskz_expand_epi8(__mmask64 __A, __m512i __B) { return (__m512i)__builtin_ia32_expandqi512_maskz( (__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_expandloadu_epi8(__m512i __A, __mmask64 __B, const void *__C) { +__funline __m512i _mm512_mask_expandloadu_epi8(__m512i __A, __mmask64 __B, + const void *__C) { return (__m512i)__builtin_ia32_expandloadqi512_mask( (const __v64qi *)__C, (__v64qi)__A, (__mmask64)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_expandloadu_epi8(__mmask64 __A, const void *__B) { +__funline __m512i _mm512_maskz_expandloadu_epi8(__mmask64 __A, const void *__B) { return (__m512i)__builtin_ia32_expandloadqi512_maskz( (const __v64qi *)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_expand_epi16(__m512i __A, __mmask32 __B, __m512i __C) { +__funline __m512i _mm512_mask_expand_epi16(__m512i __A, __mmask32 __B, + __m512i __C) { return (__m512i)__builtin_ia32_expandhi512_mask((__v32hi)__C, (__v32hi)__A, (__mmask32)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_expand_epi16(__mmask32 __A, __m512i __B) { +__funline __m512i _mm512_maskz_expand_epi16(__mmask32 __A, __m512i __B) { return (__m512i)__builtin_ia32_expandhi512_maskz( (__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_expandloadu_epi16(__m512i __A, __mmask32 __B, const void *__C) { +__funline __m512i _mm512_mask_expandloadu_epi16(__m512i __A, __mmask32 __B, + const void *__C) { return (__m512i)__builtin_ia32_expandloadhi512_mask( (const __v32hi *)__C, (__v32hi)__A, (__mmask32)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_expandloadu_epi16(__mmask32 __A, const void *__B) { +__funline __m512i _mm512_maskz_expandloadu_epi16(__mmask32 __A, const void *__B) { return (__m512i)__builtin_ia32_expandloadhi512_maskz( (const __v32hi *)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__A); } #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_shrdi_epi16(__m512i __A, __mmask32 __B, __m512i __C, - __m512i __D, int __E) { +__funline __m512i _mm512_mask_shrdi_epi16(__m512i __A, __mmask32 __B, __m512i __C, + __m512i __D, int __E) { return (__m512i)__builtin_ia32_vpshrd_v32hi_mask( (__v32hi)__C, (__v32hi)__D, __E, (__v32hi)__A, (__mmask32)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_shrdi_epi16(__mmask32 __A, __m512i __B, __m512i __C, int __D) { +__funline __m512i _mm512_maskz_shrdi_epi16(__mmask32 __A, __m512i __B, + __m512i __C, int __D) { return (__m512i)__builtin_ia32_vpshrd_v32hi_mask( (__v32hi)__B, (__v32hi)__C, __D, (__v32hi)_mm512_setzero_si512(), (__mmask32)__A); } -extern __inline __m512i - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_shldi_epi16(__m512i __A, __mmask32 __B, __m512i __C, - __m512i __D, int __E) { +__funline __m512i _mm512_mask_shldi_epi16(__m512i __A, __mmask32 __B, __m512i __C, + __m512i __D, int __E) { return (__m512i)__builtin_ia32_vpshld_v32hi_mask( (__v32hi)__C, (__v32hi)__D, __E, (__v32hi)__A, (__mmask32)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_shldi_epi16(__mmask32 __A, __m512i __B, __m512i __C, int __D) { +__funline __m512i _mm512_maskz_shldi_epi16(__mmask32 __A, __m512i __B, + __m512i __C, int __D) { return (__m512i)__builtin_ia32_vpshld_v32hi_mask( (__v32hi)__B, (__v32hi)__C, __D, (__v32hi)_mm512_setzero_si512(), (__mmask32)__A); @@ -418,30 +348,26 @@ extern __inline __m512i (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(A)) #endif -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __B, __m512i __C, __m512i __D) { +__funline __m512i _mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __B, __m512i __C, + __m512i __D) { return (__m512i)__builtin_ia32_vpshrdv_v32hi_mask( (__v32hi)__A, (__v32hi)__C, (__v32hi)__D, (__mmask32)__B); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_maskz_shrdv_epi16(__mmask32 __A, __m512i __B, __m512i __C, __m512i __D) { +__funline __m512i _mm512_maskz_shrdv_epi16(__mmask32 __A, __m512i __B, + __m512i __C, __m512i __D) { return (__m512i)__builtin_ia32_vpshrdv_v32hi_maskz( (__v32hi)__B, (__v32hi)__C, (__v32hi)__D, (__mmask32)__A); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_shldv_epi16(__m512i __A, __mmask32 __B, __m512i __C, __m512i __D) { +__funline __m512i _mm512_mask_shldv_epi16(__m512i __A, __mmask32 __B, __m512i __C, + __m512i __D) { return (__m512i)__builtin_ia32_vpshldv_v32hi_mask( (__v32hi)__A, (__v32hi)__C, (__v32hi)__D, (__mmask32)__B); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_maskz_shldv_epi16(__mmask32 __A, __m512i __B, __m512i __C, __m512i __D) { +__funline __m512i _mm512_maskz_shldv_epi16(__mmask32 __A, __m512i __B, + __m512i __C, __m512i __D) { return (__m512i)__builtin_ia32_vpshldv_v32hi_maskz( (__v32hi)__B, (__v32hi)__C, (__v32hi)__D, (__mmask32)__A); } diff --git a/third_party/intel/avx512vbmi2vlintrin.internal.h b/third_party/intel/avx512vbmi2vlintrin.internal.h index 63083a57f..92bda6c13 100644 --- a/third_party/intel/avx512vbmi2vlintrin.internal.h +++ b/third_party/intel/avx512vbmi2vlintrin.internal.h @@ -12,414 +12,322 @@ #define __DISABLE_AVX512VBMI2VL__ #endif /* __AVX512VBMIVL__ */ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_compress_epi8(__m128i __A, __mmask16 __B, __m128i __C) { +__funline __m128i _mm_mask_compress_epi8(__m128i __A, __mmask16 __B, + __m128i __C) { return (__m128i)__builtin_ia32_compressqi128_mask((__v16qi)__C, (__v16qi)__A, (__mmask16)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_compress_epi8(__mmask16 __A, __m128i __B) { +__funline __m128i _mm_maskz_compress_epi8(__mmask16 __A, __m128i __B) { return (__m128i)__builtin_ia32_compressqi128_mask( (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__A); } -extern __inline void - __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) - _mm256_mask_compressstoreu_epi16(void *__A, __mmask16 __B, __m256i __C) { +__funline void _mm256_mask_compressstoreu_epi16(void *__A, __mmask16 __B, + __m256i __C) { __builtin_ia32_compressstoreuhi256_mask((__v16hi *)__A, (__v16hi)__C, (__mmask16)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_compress_epi16(__m128i __A, __mmask8 __B, __m128i __C) { +__funline __m128i _mm_mask_compress_epi16(__m128i __A, __mmask8 __B, + __m128i __C) { return (__m128i)__builtin_ia32_compresshi128_mask((__v8hi)__C, (__v8hi)__A, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_compress_epi16(__mmask8 __A, __m128i __B) { +__funline __m128i _mm_maskz_compress_epi16(__mmask8 __A, __m128i __B) { return (__m128i)__builtin_ia32_compresshi128_mask( (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_compress_epi16(__m256i __A, __mmask16 __B, __m256i __C) { +__funline __m256i _mm256_mask_compress_epi16(__m256i __A, __mmask16 __B, + __m256i __C) { return (__m256i)__builtin_ia32_compresshi256_mask((__v16hi)__C, (__v16hi)__A, (__mmask16)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_compress_epi16(__mmask16 __A, __m256i __B) { +__funline __m256i _mm256_maskz_compress_epi16(__mmask16 __A, __m256i __B) { return (__m256i)__builtin_ia32_compresshi256_mask( (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__A); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_compressstoreu_epi8(void *__A, __mmask16 __B, __m128i __C) { +__funline void _mm_mask_compressstoreu_epi8(void *__A, __mmask16 __B, + __m128i __C) { __builtin_ia32_compressstoreuqi128_mask((__v16qi *)__A, (__v16qi)__C, (__mmask16)__B); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_compressstoreu_epi16(void *__A, __mmask8 __B, __m128i __C) { +__funline void _mm_mask_compressstoreu_epi16(void *__A, __mmask8 __B, + __m128i __C) { __builtin_ia32_compressstoreuhi128_mask((__v8hi *)__A, (__v8hi)__C, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_expand_epi8(__m128i __A, __mmask16 __B, __m128i __C) { +__funline __m128i _mm_mask_expand_epi8(__m128i __A, __mmask16 __B, __m128i __C) { return (__m128i)__builtin_ia32_expandqi128_mask((__v16qi)__C, (__v16qi)__A, (__mmask16)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_expand_epi8(__mmask16 __A, __m128i __B) { +__funline __m128i _mm_maskz_expand_epi8(__mmask16 __A, __m128i __B) { return (__m128i)__builtin_ia32_expandqi128_maskz( (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_expandloadu_epi8(__m128i __A, __mmask16 __B, const void *__C) { +__funline __m128i _mm_mask_expandloadu_epi8(__m128i __A, __mmask16 __B, + const void *__C) { return (__m128i)__builtin_ia32_expandloadqi128_mask( (const __v16qi *)__C, (__v16qi)__A, (__mmask16)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_expandloadu_epi8(__mmask16 __A, const void *__B) { 
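A short usage sketch for the byte compress-store form above (illustrative only; AVX512VBMI2 and AVX512VL targets are assumed, and pack_selected_bytes is a hypothetical helper, not part of these headers):

  #include <immintrin.h>

  /* Left-pack the bytes of v whose bit is set in keep, storing them
     contiguously at dst; popcount(keep) bytes are written. */
  static void pack_selected_bytes(unsigned char *dst, __m128i v,
                                  __mmask16 keep) {
    _mm_mask_compressstoreu_epi8(dst, keep, v);
  }

This is the classic left-packing primitive behind branchless filtering: the mask usually comes from a vector comparison, and the store advances dst by the popcount of the mask.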
+__funline __m128i _mm_maskz_expandloadu_epi8(__mmask16 __A, const void *__B) { return (__m128i)__builtin_ia32_expandloadqi128_maskz( (const __v16qi *)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_expand_epi16(__m128i __A, __mmask8 __B, __m128i __C) { +__funline __m128i _mm_mask_expand_epi16(__m128i __A, __mmask8 __B, __m128i __C) { return (__m128i)__builtin_ia32_expandhi128_mask((__v8hi)__C, (__v8hi)__A, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_expand_epi16(__mmask8 __A, __m128i __B) { +__funline __m128i _mm_maskz_expand_epi16(__mmask8 __A, __m128i __B) { return (__m128i)__builtin_ia32_expandhi128_maskz( (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_expandloadu_epi16(__m128i __A, __mmask8 __B, const void *__C) { +__funline __m128i _mm_mask_expandloadu_epi16(__m128i __A, __mmask8 __B, + const void *__C) { return (__m128i)__builtin_ia32_expandloadhi128_mask( (const __v8hi *)__C, (__v8hi)__A, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_expandloadu_epi16(__mmask8 __A, const void *__B) { +__funline __m128i _mm_maskz_expandloadu_epi16(__mmask8 __A, const void *__B) { return (__m128i)__builtin_ia32_expandloadhi128_maskz( (const __v8hi *)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_expand_epi16(__m256i __A, __mmask16 __B, __m256i __C) { +__funline __m256i _mm256_mask_expand_epi16(__m256i __A, __mmask16 __B, + __m256i __C) { return (__m256i)__builtin_ia32_expandhi256_mask((__v16hi)__C, (__v16hi)__A, (__mmask16)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_expand_epi16(__mmask16 __A, __m256i __B) { +__funline __m256i _mm256_maskz_expand_epi16(__mmask16 __A, __m256i __B) { return (__m256i)__builtin_ia32_expandhi256_maskz( (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_expandloadu_epi16(__m256i __A, __mmask16 __B, const void *__C) { +__funline __m256i _mm256_mask_expandloadu_epi16(__m256i __A, __mmask16 __B, + const void *__C) { return (__m256i)__builtin_ia32_expandloadhi256_mask( (const __v16hi *)__C, (__v16hi)__A, (__mmask16)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_expandloadu_epi16(__mmask16 __A, const void *__B) { +__funline __m256i _mm256_maskz_expandloadu_epi16(__mmask16 __A, const void *__B) { return (__m256i)__builtin_ia32_expandloadhi256_maskz( (const __v16hi *)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__A); } #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shrdi_epi16(__m256i __A, __m256i __B, int __C) { +__funline __m256i _mm256_shrdi_epi16(__m256i __A, __m256i __B, int __C) { return (__m256i)__builtin_ia32_vpshrd_v16hi((__v16hi)__A, (__v16hi)__B, __C); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_shrdi_epi16(__m256i __A, __mmask16 __B, __m256i __C, - __m256i __D, int __E) { 
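A usage sketch for the double-shift intrinsics above (illustrative only; AVX512VBMI2 and AVX512VL targets are assumed, and rotr16_by3 is a hypothetical helper): shrd concatenates a pair of lanes and shifts the pair right, so passing the same vector twice yields a per-lane rotate.

  #include <immintrin.h>

  /* Rotate each 16-bit lane of x right by 3 bits. */
  static __m256i rotr16_by3(__m256i x) {
    return _mm256_shrdi_epi16(x, x, 3);
  }

With distinct operands the same instruction extracts a 16-bit window that straddles the boundary between the two source lanes.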
+__funline __m256i _mm256_mask_shrdi_epi16(__m256i __A, __mmask16 __B, __m256i __C, + __m256i __D, int __E) { return (__m256i)__builtin_ia32_vpshrd_v16hi_mask( (__v16hi)__C, (__v16hi)__D, __E, (__v16hi)__A, (__mmask16)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_shrdi_epi16(__mmask16 __A, __m256i __B, __m256i __C, int __D) { +__funline __m256i _mm256_maskz_shrdi_epi16(__mmask16 __A, __m256i __B, + __m256i __C, int __D) { return (__m256i)__builtin_ia32_vpshrd_v16hi_mask( (__v16hi)__B, (__v16hi)__C, __D, (__v16hi)_mm256_setzero_si256(), (__mmask16)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_shrdi_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, - int __E) { +__funline __m256i _mm256_mask_shrdi_epi32(__m256i __A, __mmask8 __B, __m256i __C, + __m256i __D, int __E) { return (__m256i)__builtin_ia32_vpshrd_v8si_mask((__v8si)__C, (__v8si)__D, __E, (__v8si)__A, (__mmask8)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_shrdi_epi32(__mmask8 __A, __m256i __B, __m256i __C, int __D) { +__funline __m256i _mm256_maskz_shrdi_epi32(__mmask8 __A, __m256i __B, __m256i __C, + int __D) { return (__m256i)__builtin_ia32_vpshrd_v8si_mask( (__v8si)__B, (__v8si)__C, __D, (__v8si)_mm256_setzero_si256(), (__mmask8)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shrdi_epi32(__m256i __A, __m256i __B, int __C) { +__funline __m256i _mm256_shrdi_epi32(__m256i __A, __m256i __B, int __C) { return (__m256i)__builtin_ia32_vpshrd_v8si((__v8si)__A, (__v8si)__B, __C); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_shrdi_epi64(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, - int __E) { +__funline __m256i _mm256_mask_shrdi_epi64(__m256i __A, __mmask8 __B, __m256i __C, + __m256i __D, int __E) { return (__m256i)__builtin_ia32_vpshrd_v4di_mask((__v4di)__C, (__v4di)__D, __E, (__v4di)__A, (__mmask8)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_shrdi_epi64(__mmask8 __A, __m256i __B, __m256i __C, int __D) { +__funline __m256i _mm256_maskz_shrdi_epi64(__mmask8 __A, __m256i __B, __m256i __C, + int __D) { return (__m256i)__builtin_ia32_vpshrd_v4di_mask( (__v4di)__B, (__v4di)__C, __D, (__v4di)_mm256_setzero_si256(), (__mmask8)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shrdi_epi64(__m256i __A, __m256i __B, int __C) { +__funline __m256i _mm256_shrdi_epi64(__m256i __A, __m256i __B, int __C) { return (__m256i)__builtin_ia32_vpshrd_v4di((__v4di)__A, (__v4di)__B, __C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shrdi_epi16(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, - int __E) { +__funline __m128i _mm_mask_shrdi_epi16(__m128i __A, __mmask8 __B, __m128i __C, + __m128i __D, int __E) { return (__m128i)__builtin_ia32_vpshrd_v8hi_mask((__v8hi)__C, (__v8hi)__D, __E, (__v8hi)__A, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shrdi_epi16(__mmask8 __A, __m128i __B, __m128i __C, int __D) { +__funline __m128i _mm_maskz_shrdi_epi16(__mmask8 __A, __m128i __B, __m128i __C, + int __D) { return 
(__m128i)__builtin_ia32_vpshrd_v8hi_mask((__v8hi)__B, (__v8hi)__C, __D, (__v8hi)_mm_setzero_si128(), (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shrdi_epi16(__m128i __A, __m128i __B, int __C) { +__funline __m128i _mm_shrdi_epi16(__m128i __A, __m128i __B, int __C) { return (__m128i)__builtin_ia32_vpshrd_v8hi((__v8hi)__A, (__v8hi)__B, __C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shrdi_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, - int __E) { +__funline __m128i _mm_mask_shrdi_epi32(__m128i __A, __mmask8 __B, __m128i __C, + __m128i __D, int __E) { return (__m128i)__builtin_ia32_vpshrd_v4si_mask((__v4si)__C, (__v4si)__D, __E, (__v4si)__A, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shrdi_epi32(__mmask8 __A, __m128i __B, __m128i __C, int __D) { +__funline __m128i _mm_maskz_shrdi_epi32(__mmask8 __A, __m128i __B, __m128i __C, + int __D) { return (__m128i)__builtin_ia32_vpshrd_v4si_mask((__v4si)__B, (__v4si)__C, __D, (__v4si)_mm_setzero_si128(), (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shrdi_epi32(__m128i __A, __m128i __B, int __C) { +__funline __m128i _mm_shrdi_epi32(__m128i __A, __m128i __B, int __C) { return (__m128i)__builtin_ia32_vpshrd_v4si((__v4si)__A, (__v4si)__B, __C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shrdi_epi64(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, - int __E) { +__funline __m128i _mm_mask_shrdi_epi64(__m128i __A, __mmask8 __B, __m128i __C, + __m128i __D, int __E) { return (__m128i)__builtin_ia32_vpshrd_v2di_mask((__v2di)__C, (__v2di)__D, __E, (__v2di)__A, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shrdi_epi64(__mmask8 __A, __m128i __B, __m128i __C, int __D) { +__funline __m128i _mm_maskz_shrdi_epi64(__mmask8 __A, __m128i __B, __m128i __C, + int __D) { return (__m128i)__builtin_ia32_vpshrd_v2di_mask((__v2di)__B, (__v2di)__C, __D, (__v2di)_mm_setzero_si128(), (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shrdi_epi64(__m128i __A, __m128i __B, int __C) { +__funline __m128i _mm_shrdi_epi64(__m128i __A, __m128i __B, int __C) { return (__m128i)__builtin_ia32_vpshrd_v2di((__v2di)__A, (__v2di)__B, __C); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shldi_epi16(__m256i __A, __m256i __B, int __C) { +__funline __m256i _mm256_shldi_epi16(__m256i __A, __m256i __B, int __C) { return (__m256i)__builtin_ia32_vpshld_v16hi((__v16hi)__A, (__v16hi)__B, __C); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_shldi_epi16(__m256i __A, __mmask16 __B, __m256i __C, - __m256i __D, int __E) { +__funline __m256i _mm256_mask_shldi_epi16(__m256i __A, __mmask16 __B, __m256i __C, + __m256i __D, int __E) { return (__m256i)__builtin_ia32_vpshld_v16hi_mask( (__v16hi)__C, (__v16hi)__D, __E, (__v16hi)__A, (__mmask16)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_shldi_epi16(__mmask16 __A, __m256i __B, __m256i __C, int __D) { +__funline __m256i _mm256_maskz_shldi_epi16(__mmask16 __A, __m256i __B, 
+ __m256i __C, int __D) { return (__m256i)__builtin_ia32_vpshld_v16hi_mask( (__v16hi)__B, (__v16hi)__C, __D, (__v16hi)_mm256_setzero_si256(), (__mmask16)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_shldi_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, - int __E) { +__funline __m256i _mm256_mask_shldi_epi32(__m256i __A, __mmask8 __B, __m256i __C, + __m256i __D, int __E) { return (__m256i)__builtin_ia32_vpshld_v8si_mask((__v8si)__C, (__v8si)__D, __E, (__v8si)__A, (__mmask8)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_shldi_epi32(__mmask8 __A, __m256i __B, __m256i __C, int __D) { +__funline __m256i _mm256_maskz_shldi_epi32(__mmask8 __A, __m256i __B, __m256i __C, + int __D) { return (__m256i)__builtin_ia32_vpshld_v8si_mask( (__v8si)__B, (__v8si)__C, __D, (__v8si)_mm256_setzero_si256(), (__mmask8)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shldi_epi32(__m256i __A, __m256i __B, int __C) { +__funline __m256i _mm256_shldi_epi32(__m256i __A, __m256i __B, int __C) { return (__m256i)__builtin_ia32_vpshld_v8si((__v8si)__A, (__v8si)__B, __C); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_shldi_epi64(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, - int __E) { +__funline __m256i _mm256_mask_shldi_epi64(__m256i __A, __mmask8 __B, __m256i __C, + __m256i __D, int __E) { return (__m256i)__builtin_ia32_vpshld_v4di_mask((__v4di)__C, (__v4di)__D, __E, (__v4di)__A, (__mmask8)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_shldi_epi64(__mmask8 __A, __m256i __B, __m256i __C, int __D) { +__funline __m256i _mm256_maskz_shldi_epi64(__mmask8 __A, __m256i __B, __m256i __C, + int __D) { return (__m256i)__builtin_ia32_vpshld_v4di_mask( (__v4di)__B, (__v4di)__C, __D, (__v4di)_mm256_setzero_si256(), (__mmask8)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shldi_epi64(__m256i __A, __m256i __B, int __C) { +__funline __m256i _mm256_shldi_epi64(__m256i __A, __m256i __B, int __C) { return (__m256i)__builtin_ia32_vpshld_v4di((__v4di)__A, (__v4di)__B, __C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shldi_epi16(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, - int __E) { +__funline __m128i _mm_mask_shldi_epi16(__m128i __A, __mmask8 __B, __m128i __C, + __m128i __D, int __E) { return (__m128i)__builtin_ia32_vpshld_v8hi_mask((__v8hi)__C, (__v8hi)__D, __E, (__v8hi)__A, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shldi_epi16(__mmask8 __A, __m128i __B, __m128i __C, int __D) { +__funline __m128i _mm_maskz_shldi_epi16(__mmask8 __A, __m128i __B, __m128i __C, + int __D) { return (__m128i)__builtin_ia32_vpshld_v8hi_mask((__v8hi)__B, (__v8hi)__C, __D, (__v8hi)_mm_setzero_si128(), (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shldi_epi16(__m128i __A, __m128i __B, int __C) { +__funline __m128i _mm_shldi_epi16(__m128i __A, __m128i __B, int __C) { return (__m128i)__builtin_ia32_vpshld_v8hi((__v8hi)__A, (__v8hi)__B, __C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm_mask_shldi_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, - int __E) { +__funline __m128i _mm_mask_shldi_epi32(__m128i __A, __mmask8 __B, __m128i __C, + __m128i __D, int __E) { return (__m128i)__builtin_ia32_vpshld_v4si_mask((__v4si)__C, (__v4si)__D, __E, (__v4si)__A, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shldi_epi32(__mmask8 __A, __m128i __B, __m128i __C, int __D) { +__funline __m128i _mm_maskz_shldi_epi32(__mmask8 __A, __m128i __B, __m128i __C, + int __D) { return (__m128i)__builtin_ia32_vpshld_v4si_mask((__v4si)__B, (__v4si)__C, __D, (__v4si)_mm_setzero_si128(), (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shldi_epi32(__m128i __A, __m128i __B, int __C) { +__funline __m128i _mm_shldi_epi32(__m128i __A, __m128i __B, int __C) { return (__m128i)__builtin_ia32_vpshld_v4si((__v4si)__A, (__v4si)__B, __C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shldi_epi64(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, - int __E) { +__funline __m128i _mm_mask_shldi_epi64(__m128i __A, __mmask8 __B, __m128i __C, + __m128i __D, int __E) { return (__m128i)__builtin_ia32_vpshld_v2di_mask((__v2di)__C, (__v2di)__D, __E, (__v2di)__A, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shldi_epi64(__mmask8 __A, __m128i __B, __m128i __C, int __D) { +__funline __m128i _mm_maskz_shldi_epi64(__mmask8 __A, __m128i __B, __m128i __C, + int __D) { return (__m128i)__builtin_ia32_vpshld_v2di_mask((__v2di)__B, (__v2di)__C, __D, (__v2di)_mm_setzero_si128(), (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shldi_epi64(__m128i __A, __m128i __B, int __C) { +__funline __m128i _mm_shldi_epi64(__m128i __A, __m128i __B, int __C) { return (__m128i)__builtin_ia32_vpshld_v2di((__v2di)__A, (__v2di)__B, __C); } #else @@ -545,254 +453,206 @@ extern __inline __m128i (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)) #endif -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C) { +__funline __m256i _mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C) { return (__m256i)__builtin_ia32_vpshrdv_v16hi((__v16hi)__A, (__v16hi)__B, (__v16hi)__C); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __B, __m256i __C, + __m256i __D) { return (__m256i)__builtin_ia32_vpshrdv_v16hi_mask( (__v16hi)__A, (__v16hi)__C, (__v16hi)__D, (__mmask16)__B); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_shrdv_epi16(__mmask16 __A, __m256i __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_maskz_shrdv_epi16(__mmask16 __A, __m256i __B, + __m256i __C, __m256i __D) { return (__m256i)__builtin_ia32_vpshrdv_v16hi_maskz( (__v16hi)__B, (__v16hi)__C, (__v16hi)__D, (__mmask16)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C) { +__funline __m256i _mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C) { 
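/* Editor's note: usage sketch, not part of the patch. Per lane,
 * _mm_shldi_epi32(A, B, C) computes the upper half of the concatenation
 * A:B shifted left by C, i.e. (A << C) | (B >> (32 - C)), so passing
 * the same vector twice yields a rotate. rotl7_epi32 is a hypothetical
 * helper; assumes <immintrin.h> and avx512vbmi2 + avx512vl. */
static __m128i rotl7_epi32(__m128i a) {
  return _mm_shldi_epi32(a, a, 7); /* rotate each 32-bit lane left by 7 */
}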
return (__m256i)__builtin_ia32_vpshrdv_v8si((__v8si)__A, (__v8si)__B, (__v8si)__C); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __B, __m256i __C, + __m256i __D) { return (__m256i)__builtin_ia32_vpshrdv_v8si_mask((__v8si)__A, (__v8si)__C, (__v8si)__D, (__mmask8)__B); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_shrdv_epi32(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_maskz_shrdv_epi32(__mmask8 __A, __m256i __B, __m256i __C, + __m256i __D) { return (__m256i)__builtin_ia32_vpshrdv_v8si_maskz((__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C) { +__funline __m256i _mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C) { return (__m256i)__builtin_ia32_vpshrdv_v4di((__v4di)__A, (__v4di)__B, (__v4di)__C); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __B, __m256i __C, + __m256i __D) { return (__m256i)__builtin_ia32_vpshrdv_v4di_mask((__v4di)__A, (__v4di)__C, (__v4di)__D, (__mmask8)__B); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_shrdv_epi64(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_maskz_shrdv_epi64(__mmask8 __A, __m256i __B, __m256i __C, + __m256i __D) { return (__m256i)__builtin_ia32_vpshrdv_v4di_maskz((__v4di)__B, (__v4di)__C, (__v4di)__D, (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpshrdv_v8hi((__v8hi)__A, (__v8hi)__B, (__v8hi)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shrdv_epi16(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_mask_shrdv_epi16(__m128i __A, __mmask8 __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpshrdv_v8hi_mask((__v8hi)__A, (__v8hi)__C, (__v8hi)__D, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shrdv_epi16(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_maskz_shrdv_epi16(__mmask8 __A, __m128i __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpshrdv_v8hi_maskz((__v8hi)__B, (__v8hi)__C, (__v8hi)__D, (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpshrdv_v4si((__v4si)__A, (__v4si)__B, (__v4si)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shrdv_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_mask_shrdv_epi32(__m128i __A, __mmask8 __B, 
__m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpshrdv_v4si_mask((__v4si)__A, (__v4si)__C, (__v4si)__D, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shrdv_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_maskz_shrdv_epi32(__mmask8 __A, __m128i __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpshrdv_v4si_maskz((__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpshrdv_v2di((__v2di)__A, (__v2di)__B, (__v2di)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shrdv_epi64(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_mask_shrdv_epi64(__m128i __A, __mmask8 __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpshrdv_v2di_mask((__v2di)__A, (__v2di)__C, (__v2di)__D, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shrdv_epi64(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_maskz_shrdv_epi64(__mmask8 __A, __m128i __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpshrdv_v2di_maskz((__v2di)__B, (__v2di)__C, (__v2di)__D, (__mmask8)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C) { +__funline __m256i _mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C) { return (__m256i)__builtin_ia32_vpshldv_v16hi((__v16hi)__A, (__v16hi)__B, (__v16hi)__C); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_shldv_epi16(__m256i __A, __mmask16 __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_mask_shldv_epi16(__m256i __A, __mmask16 __B, __m256i __C, + __m256i __D) { return (__m256i)__builtin_ia32_vpshldv_v16hi_mask( (__v16hi)__A, (__v16hi)__C, (__v16hi)__D, (__mmask16)__B); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_shldv_epi16(__mmask16 __A, __m256i __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_maskz_shldv_epi16(__mmask16 __A, __m256i __B, + __m256i __C, __m256i __D) { return (__m256i)__builtin_ia32_vpshldv_v16hi_maskz( (__v16hi)__B, (__v16hi)__C, (__v16hi)__D, (__mmask16)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C) { +__funline __m256i _mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C) { return (__m256i)__builtin_ia32_vpshldv_v8si((__v8si)__A, (__v8si)__B, (__v8si)__C); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_shldv_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_mask_shldv_epi32(__m256i __A, __mmask8 __B, __m256i __C, + __m256i __D) { return (__m256i)__builtin_ia32_vpshldv_v8si_mask((__v8si)__A, (__v8si)__C, (__v8si)__D, (__mmask8)__B); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_shldv_epi32(__mmask8 __A, __m256i __B, __m256i __C, __m256i 
__D) { +__funline __m256i _mm256_maskz_shldv_epi32(__mmask8 __A, __m256i __B, __m256i __C, + __m256i __D) { return (__m256i)__builtin_ia32_vpshldv_v8si_maskz((__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C) { +__funline __m256i _mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C) { return (__m256i)__builtin_ia32_vpshldv_v4di((__v4di)__A, (__v4di)__B, (__v4di)__C); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_shldv_epi64(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_mask_shldv_epi64(__m256i __A, __mmask8 __B, __m256i __C, + __m256i __D) { return (__m256i)__builtin_ia32_vpshldv_v4di_mask((__v4di)__A, (__v4di)__C, (__v4di)__D, (__mmask8)__B); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_shldv_epi64(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_maskz_shldv_epi64(__mmask8 __A, __m256i __B, __m256i __C, + __m256i __D) { return (__m256i)__builtin_ia32_vpshldv_v4di_maskz((__v4di)__B, (__v4di)__C, (__v4di)__D, (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpshldv_v8hi((__v8hi)__A, (__v8hi)__B, (__v8hi)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shldv_epi16(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_mask_shldv_epi16(__m128i __A, __mmask8 __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpshldv_v8hi_mask((__v8hi)__A, (__v8hi)__C, (__v8hi)__D, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shldv_epi16(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_maskz_shldv_epi16(__mmask8 __A, __m128i __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpshldv_v8hi_maskz((__v8hi)__B, (__v8hi)__C, (__v8hi)__D, (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpshldv_v4si((__v4si)__A, (__v4si)__B, (__v4si)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shldv_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_mask_shldv_epi32(__m128i __A, __mmask8 __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpshldv_v4si_mask((__v4si)__A, (__v4si)__C, (__v4si)__D, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shldv_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_maskz_shldv_epi32(__mmask8 __A, __m128i __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpshldv_v4si_maskz((__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shldv_epi64(__m128i 
__A, __m128i __B, __m128i __C) { +__funline __m128i _mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpshldv_v2di((__v2di)__A, (__v2di)__B, (__v2di)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shldv_epi64(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_mask_shldv_epi64(__m128i __A, __mmask8 __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpshldv_v2di_mask((__v2di)__A, (__v2di)__C, (__v2di)__D, (__mmask8)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shldv_epi64(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_maskz_shldv_epi64(__mmask8 __A, __m128i __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpshldv_v2di_maskz((__v2di)__B, (__v2di)__C, (__v2di)__D, (__mmask8)__A); } @@ -809,51 +669,41 @@ extern __inline __m128i #define __DISABLE_AVX512VBMI2VLBW__ #endif /* __AVX512VBMIVLBW__ */ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_compress_epi8(__m256i __A, __mmask32 __B, __m256i __C) { +__funline __m256i _mm256_mask_compress_epi8(__m256i __A, __mmask32 __B, + __m256i __C) { return (__m256i)__builtin_ia32_compressqi256_mask((__v32qi)__C, (__v32qi)__A, (__mmask32)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_compress_epi8(__mmask32 __A, __m256i __B) { +__funline __m256i _mm256_maskz_compress_epi8(__mmask32 __A, __m256i __B) { return (__m256i)__builtin_ia32_compressqi256_mask( (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__A); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_compressstoreu_epi8(void *__A, __mmask32 __B, __m256i __C) { +__funline void _mm256_mask_compressstoreu_epi8(void *__A, __mmask32 __B, + __m256i __C) { __builtin_ia32_compressstoreuqi256_mask((__v32qi *)__A, (__v32qi)__C, (__mmask32)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_expand_epi8(__m256i __A, __mmask32 __B, __m256i __C) { +__funline __m256i _mm256_mask_expand_epi8(__m256i __A, __mmask32 __B, + __m256i __C) { return (__m256i)__builtin_ia32_expandqi256_mask((__v32qi)__C, (__v32qi)__A, (__mmask32)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_expand_epi8(__mmask32 __A, __m256i __B) { +__funline __m256i _mm256_maskz_expand_epi8(__mmask32 __A, __m256i __B) { return (__m256i)__builtin_ia32_expandqi256_maskz( (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_expandloadu_epi8(__m256i __A, __mmask32 __B, const void *__C) { +__funline __m256i _mm256_mask_expandloadu_epi8(__m256i __A, __mmask32 __B, + const void *__C) { return (__m256i)__builtin_ia32_expandloadqi256_mask( (const __v32qi *)__C, (__v32qi)__A, (__mmask32)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_expandloadu_epi8(__mmask32 __A, const void *__B) { +__funline __m256i _mm256_maskz_expandloadu_epi8(__mmask32 __A, const void *__B) { return (__m256i)__builtin_ia32_expandloadqi256_maskz( (const __v32qi *)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__A); } diff --git 
a/third_party/intel/avx512vbmiintrin.internal.h b/third_party/intel/avx512vbmiintrin.internal.h index e0b4f1f71..dad021826 100644 --- a/third_party/intel/avx512vbmiintrin.internal.h +++ b/third_party/intel/avx512vbmiintrin.internal.h @@ -11,77 +11,62 @@ #define __DISABLE_AVX512VBMI__ #endif /* __AVX512VBMI__ */ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X, - __m512i __Y) { +__funline __m512i _mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, + __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_vpmultishiftqb512_mask( (__v64qi)__X, (__v64qi)__Y, (__v64qi)__W, (__mmask64)__M); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y) { +__funline __m512i _mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, + __m512i __Y) { return (__m512i)__builtin_ia32_vpmultishiftqb512_mask( (__v64qi)__X, (__v64qi)__Y, (__v64qi)_mm512_setzero_si512(), (__mmask64)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y) { +__funline __m512i _mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_vpmultishiftqb512_mask( (__v64qi)__X, (__v64qi)__Y, (__v64qi)_mm512_undefined_epi32(), (__mmask64)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permutexvar_epi8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_permutexvar_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_permvarqi512_mask( (__v64qi)__B, (__v64qi)__A, (__v64qi)_mm512_undefined_epi32(), (__mmask64)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, __m512i __B) { +__funline __m512i _mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_permvarqi512_mask( (__v64qi)__B, (__v64qi)__A, (__v64qi)_mm512_setzero_si512(), (__mmask64)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) { +__funline __m512i _mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, + __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_permvarqi512_mask( (__v64qi)__B, (__v64qi)__A, (__v64qi)__W, (__mmask64)__M); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) { +__funline __m512i _mm512_permutex2var_epi8(__m512i __A, __m512i __I, + __m512i __B) { return (__m512i)__builtin_ia32_vpermt2varqi512_mask( (__v64qi)__I /* idx */, (__v64qi)__A, (__v64qi)__B, (__mmask64)-1); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I, - __m512i __B) { +__funline __m512i _mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, + __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermt2varqi512_mask( (__v64qi)__I /* idx */, (__v64qi)__A, (__v64qi)__B, (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, - __m512i __B) { +__funline __m512i _mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, + __mmask64 __U, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varqi512_mask((__v64qi)__A, (__v64qi)__I /* idx */, @@ -89,10 +74,8 @@ extern __inline __m512i (__mmask64)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I, - __m512i __B) { +__funline __m512i _mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, + __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermt2varqi512_maskz( (__v64qi)__I /* idx */, diff --git a/third_party/intel/avx512vbmivlintrin.internal.h b/third_party/intel/avx512vbmivlintrin.internal.h index 53db4b9ab..a7ff671e6 100644 --- a/third_party/intel/avx512vbmivlintrin.internal.h +++ b/third_party/intel/avx512vbmivlintrin.internal.h @@ -12,123 +12,98 @@ #define __DISABLE_AVX512VBMIVL__ #endif /* __AVX512VBMIVL__ */ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X, - __m256i __Y) { +__funline __m256i _mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, + __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_vpmultishiftqb256_mask( (__v32qi)__X, (__v32qi)__Y, (__v32qi)__W, (__mmask32)__M); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y) { +__funline __m256i _mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, + __m256i __Y) { return (__m256i)__builtin_ia32_vpmultishiftqb256_mask( (__v32qi)__X, (__v32qi)__Y, (__v32qi)_mm256_setzero_si256(), (__mmask32)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_vpmultishiftqb256_mask( (__v32qi)__X, (__v32qi)__Y, (__v32qi)_mm256_undefined_si256(), (__mmask32)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X, - __m128i __Y) { +__funline __m128i _mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, + __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_vpmultishiftqb128_mask( (__v16qi)__X, (__v16qi)__Y, (__v16qi)__W, (__mmask16)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y) { +__funline __m128i _mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, + __m128i __Y) { return (__m128i)__builtin_ia32_vpmultishiftqb128_mask( (__v16qi)__X, (__v16qi)__Y, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_multishift_epi64_epi8(__m128i __X, __m128i __Y) { +__funline __m128i _mm_multishift_epi64_epi8(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_vpmultishiftqb128_mask( (__v16qi)__X, (__v16qi)__Y, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutexvar_epi8(__m256i __A, __m256i __B) 
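/* Editor's note: usage sketch, not part of the patch. permutexvar_epi8
 * is VBMI's full-width byte shuffle: result byte i is src[idx[i]], the
 * index coming from the low bits of each control byte. reverse_bytes is
 * a hypothetical helper built on the 128-bit form defined below;
 * assumes <immintrin.h> and avx512vbmi + avx512vl. */
static __m128i reverse_bytes(__m128i a) {
  const __m128i idx = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                    7, 6, 5, 4, 3, 2, 1, 0);
  return _mm_permutexvar_epi8(idx, a);
}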
{ +__funline __m256i _mm256_permutexvar_epi8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_permvarqi256_mask( (__v32qi)__B, (__v32qi)__A, (__v32qi)_mm256_undefined_si256(), (__mmask32)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_permvarqi256_mask( (__v32qi)__B, (__v32qi)__A, (__v32qi)_mm256_setzero_si256(), (__mmask32)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) { +__funline __m256i _mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, + __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_permvarqi256_mask( (__v32qi)__B, (__v32qi)__A, (__v32qi)__W, (__mmask32)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_permutexvar_epi8(__m128i __A, __m128i __B) { +__funline __m128i _mm_permutexvar_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_permvarqi128_mask( (__v16qi)__B, (__v16qi)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_permvarqi128_mask( (__v16qi)__B, (__v16qi)__A, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) { +__funline __m128i _mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, + __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_permvarqi128_mask( (__v16qi)__B, (__v16qi)__A, (__v16qi)__W, (__mmask16)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) { +__funline __m256i _mm256_permutex2var_epi8(__m256i __A, __m256i __I, + __m256i __B) { return (__m256i)__builtin_ia32_vpermt2varqi256_mask( (__v32qi)__I /* idx */, (__v32qi)__A, (__v32qi)__B, (__mmask32)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I, - __m256i __B) { +__funline __m256i _mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, + __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermt2varqi256_mask( (__v32qi)__I /* idx */, (__v32qi)__A, (__v32qi)__B, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U, - __m256i __B) { +__funline __m256i _mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, + __mmask32 __U, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varqi256_mask((__v32qi)__A, (__v32qi)__I /* idx */, @@ -136,39 +111,31 @@ extern __inline __m256i (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I, - __m256i __B) { +__funline __m256i 
_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, + __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermt2varqi256_maskz( (__v32qi)__I /* idx */, (__v32qi)__A, (__v32qi)__B, (__mmask32)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) { +__funline __m128i _mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermt2varqi128_mask( (__v16qi)__I /* idx */, (__v16qi)__A, (__v16qi)__B, (__mmask16)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I, - __m128i __B) { +__funline __m128i _mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, + __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermt2varqi128_mask( (__v16qi)__I /* idx */, (__v16qi)__A, (__v16qi)__B, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U, - __m128i __B) { +__funline __m128i _mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, + __mmask16 __U, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varqi128_mask((__v16qi)__A, (__v16qi)__I /* idx */, @@ -176,10 +143,8 @@ extern __inline __m128i (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, - __m128i __B) { +__funline __m128i _mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, + __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermt2varqi128_maskz( (__v16qi)__I /* idx */, diff --git a/third_party/intel/avx512vlbwintrin.internal.h b/third_party/intel/avx512vlbwintrin.internal.h index 836367405..474b39b49 100644 --- a/third_party/intel/avx512vlbwintrin.internal.h +++ b/third_party/intel/avx512vlbwintrin.internal.h @@ -11,450 +11,345 @@ #define __DISABLE_AVX512VLBW__ #endif /* __AVX512VLBW__ */ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_mov_epi8(__m256i __W, __mmask32 __U, __m256i __A) { +__funline __m256i _mm256_mask_mov_epi8(__m256i __W, __mmask32 __U, __m256i __A) { return (__m256i)__builtin_ia32_movdquqi256_mask((__v32qi)__A, (__v32qi)__W, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_mov_epi8(__mmask32 __U, __m256i __A) { +__funline __m256i _mm256_maskz_mov_epi8(__mmask32 __U, __m256i __A) { return (__m256i)__builtin_ia32_movdquqi256_mask( (__v32qi)__A, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mov_epi8(__m128i __W, __mmask16 __U, __m128i __A) { +__funline __m128i _mm_mask_mov_epi8(__m128i __W, __mmask16 __U, __m128i __A) { return (__m128i)__builtin_ia32_movdquqi128_mask((__v16qi)__A, (__v16qi)__W, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mov_epi8(__mmask16 __U, __m128i __A) { +__funline __m128i _mm_maskz_mov_epi8(__mmask16 __U, __m128i __A) { return (__m128i)__builtin_ia32_movdquqi128_mask( (__v16qi)__A, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm256_mask_storeu_epi8(void *__P, __mmask32 __U, __m256i __A) { +__funline void _mm256_mask_storeu_epi8(void *__P, __mmask32 __U, __m256i __A) { __builtin_ia32_storedquqi256_mask((char *)__P, (__v32qi)__A, (__mmask32)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_storeu_epi8(void *__P, __mmask16 __U, __m128i __A) { +__funline void _mm_mask_storeu_epi8(void *__P, __mmask16 __U, __m128i __A) { __builtin_ia32_storedquqi128_mask((char *)__P, (__v16qi)__A, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_loadu_epi16(__m256i __W, __mmask16 __U, void const *__P) { +__funline __m256i _mm256_mask_loadu_epi16(__m256i __W, __mmask16 __U, + void const *__P) { return (__m256i)__builtin_ia32_loaddquhi256_mask( (const short *)__P, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_loadu_epi16(__mmask16 __U, void const *__P) { +__funline __m256i _mm256_maskz_loadu_epi16(__mmask16 __U, void const *__P) { return (__m256i)__builtin_ia32_loaddquhi256_mask( (const short *)__P, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_loadu_epi16(__m128i __W, __mmask8 __U, void const *__P) { +__funline __m128i _mm_mask_loadu_epi16(__m128i __W, __mmask8 __U, + void const *__P) { return (__m128i)__builtin_ia32_loaddquhi128_mask((const short *)__P, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_loadu_epi16(__mmask8 __U, void const *__P) { +__funline __m128i _mm_maskz_loadu_epi16(__mmask8 __U, void const *__P) { return (__m128i)__builtin_ia32_loaddquhi128_mask( (const short *)__P, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_mov_epi16(__m256i __W, __mmask16 __U, __m256i __A) { +__funline __m256i _mm256_mask_mov_epi16(__m256i __W, __mmask16 __U, __m256i __A) { return (__m256i)__builtin_ia32_movdquhi256_mask((__v16hi)__A, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_mov_epi16(__mmask16 __U, __m256i __A) { +__funline __m256i _mm256_maskz_mov_epi16(__mmask16 __U, __m256i __A) { return (__m256i)__builtin_ia32_movdquhi256_mask( (__v16hi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mov_epi16(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_mov_epi16(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_movdquhi128_mask((__v8hi)__A, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mov_epi16(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_mov_epi16(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_movdquhi128_mask( (__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_loadu_epi8(__m256i __W, __mmask32 __U, void const *__P) { +__funline __m256i _mm256_mask_loadu_epi8(__m256i __W, __mmask32 __U, + void const *__P) { return 
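/* Editor's note: usage sketch, not part of the patch. AVX-512 masked
 * loads suppress faults on masked-off elements, so a buffer tail
 * shorter than one vector can be read without touching bytes past its
 * end. load_tail is a hypothetical helper; assumes <immintrin.h>,
 * avx512bw + avx512vl, and 0 <= n <= 16. */
static __m128i load_tail(const void *p, unsigned n) {
  return _mm_maskz_loadu_epi8((__mmask16)((1u << n) - 1u), p);
}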
(__m256i)__builtin_ia32_loaddquqi256_mask( (const char *)__P, (__v32qi)__W, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_loadu_epi8(__mmask32 __U, void const *__P) { +__funline __m256i _mm256_maskz_loadu_epi8(__mmask32 __U, void const *__P) { return (__m256i)__builtin_ia32_loaddquqi256_mask( (const char *)__P, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_loadu_epi8(__m128i __W, __mmask16 __U, void const *__P) { +__funline __m128i _mm_mask_loadu_epi8(__m128i __W, __mmask16 __U, + void const *__P) { return (__m128i)__builtin_ia32_loaddquqi128_mask( (const char *)__P, (__v16qi)__W, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_loadu_epi8(__mmask16 __U, void const *__P) { +__funline __m128i _mm_maskz_loadu_epi8(__mmask16 __U, void const *__P) { return (__m128i)__builtin_ia32_loaddquqi128_mask( (const char *)__P, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi16_epi8(__m256i __A) { +__funline __m128i _mm256_cvtepi16_epi8(__m256i __A) { return (__m128i)__builtin_ia32_pmovwb256_mask( (__v16hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi16_storeu_epi8(void *__P, __mmask16 __M, __m256i __A) { +__funline void _mm256_mask_cvtepi16_storeu_epi8(void *__P, __mmask16 __M, + __m256i __A) { __builtin_ia32_pmovwb256mem_mask((__v16qi *)__P, (__v16hi)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi16_epi8(__m128i __O, __mmask16 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtepi16_epi8(__m128i __O, __mmask16 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovwb256_mask((__v16hi)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi16_epi8(__mmask16 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtepi16_epi8(__mmask16 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovwb256_mask( (__v16hi)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsepi16_epi8(__m128i __A) { +__funline __m128i _mm_cvtsepi16_epi8(__m128i __A) { return (__m128i)__builtin_ia32_pmovswb128_mask( (__v8hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtsepi16_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtsepi16_storeu_epi8(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovswb128mem_mask((__v8qi *)__P, (__v8hi)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtsepi16_epi8(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtsepi16_epi8(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pmovswb128_mask((__v8hi)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtsepi16_epi8(__mmask8 __M, __m128i __A) { +__funline __m128i 
_mm_maskz_cvtsepi16_epi8(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovswb128_mask( (__v8hi)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtsepi16_epi8(__m256i __A) { +__funline __m128i _mm256_cvtsepi16_epi8(__m256i __A) { return (__m128i)__builtin_ia32_pmovswb256_mask( (__v16hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtsepi16_storeu_epi8(void *__P, __mmask16 __M, __m256i __A) { +__funline void _mm256_mask_cvtsepi16_storeu_epi8(void *__P, __mmask16 __M, + __m256i __A) { __builtin_ia32_pmovswb256mem_mask((__v16qi *)__P, (__v16hi)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtsepi16_epi8(__m128i __O, __mmask16 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtsepi16_epi8(__m128i __O, __mmask16 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovswb256_mask((__v16hi)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtsepi16_epi8(__mmask16 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtsepi16_epi8(__mmask16 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovswb256_mask( (__v16hi)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtusepi16_epi8(__m128i __A) { +__funline __m128i _mm_cvtusepi16_epi8(__m128i __A) { return (__m128i)__builtin_ia32_pmovuswb128_mask( (__v8hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtusepi16_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtusepi16_storeu_epi8(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovuswb128mem_mask((__v8qi *)__P, (__v8hi)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtusepi16_epi8(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtusepi16_epi8(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pmovuswb128_mask((__v8hi)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtusepi16_epi8(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtusepi16_epi8(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovuswb128_mask( (__v8hi)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtusepi16_epi8(__m256i __A) { +__funline __m128i _mm256_cvtusepi16_epi8(__m256i __A) { return (__m128i)__builtin_ia32_pmovuswb256_mask( (__v16hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask16)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtusepi16_storeu_epi8(void *__P, __mmask16 __M, __m256i __A) { +__funline void _mm256_mask_cvtusepi16_storeu_epi8(void *__P, __mmask16 __M, + __m256i __A) { __builtin_ia32_pmovuswb256mem_mask((__v16qi *)__P, (__v16hi)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtusepi16_epi8(__m128i __O, __mmask16 __M, __m256i __A) { 
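/* Editor's note: usage sketch, not part of the patch. The cvtsepi16 and
 * cvtusepi16 conversions above narrow with saturation: each 16-bit lane
 * is clamped to the signed (or unsigned) 8-bit range instead of being
 * truncated, with the packed result in the low 64 bits. narrow_sat is a
 * hypothetical helper; assumes <immintrin.h> and avx512bw + avx512vl. */
static __m128i narrow_sat(__m128i x) {
  return _mm_cvtsepi16_epi8(x); /* e.g. 300 -> 127, -300 -> -128 */
}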
+__funline __m128i _mm256_mask_cvtusepi16_epi8(__m128i __O, __mmask16 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovuswb256_mask((__v16hi)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtusepi16_epi8(__mmask16 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtusepi16_epi8(__mmask16 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovuswb256_mask( (__v16hi)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_broadcastb_epi8(__m256i __O, __mmask32 __M, __m128i __A) { +__funline __m256i _mm256_mask_broadcastb_epi8(__m256i __O, __mmask32 __M, + __m128i __A) { return (__m256i)__builtin_ia32_pbroadcastb256_mask((__v16qi)__A, (__v32qi)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_broadcastb_epi8(__mmask32 __M, __m128i __A) { +__funline __m256i _mm256_maskz_broadcastb_epi8(__mmask32 __M, __m128i __A) { return (__m256i)__builtin_ia32_pbroadcastb256_mask( (__v16qi)__A, (__v32qi)_mm256_setzero_si256(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) { +__funline __m256i _mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) { return (__m256i)__builtin_ia32_pbroadcastb256_gpr_mask(__A, (__v32qi)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_set1_epi8(__mmask32 __M, char __A) { +__funline __m256i _mm256_maskz_set1_epi8(__mmask32 __M, char __A) { return (__m256i)__builtin_ia32_pbroadcastb256_gpr_mask( __A, (__v32qi)_mm256_setzero_si256(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_broadcastb_epi8(__m128i __O, __mmask16 __M, __m128i __A) { +__funline __m128i _mm_mask_broadcastb_epi8(__m128i __O, __mmask16 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pbroadcastb128_mask((__v16qi)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_broadcastb_epi8(__mmask16 __M, __m128i __A) { +__funline __m128i _mm_maskz_broadcastb_epi8(__mmask16 __M, __m128i __A) { return (__m128i)__builtin_ia32_pbroadcastb128_mask( (__v16qi)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_set1_epi8(__m128i __O, __mmask16 __M, char __A) { +__funline __m128i _mm_mask_set1_epi8(__m128i __O, __mmask16 __M, char __A) { return (__m128i)__builtin_ia32_pbroadcastb128_gpr_mask(__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_set1_epi8(__mmask16 __M, char __A) { +__funline __m128i _mm_maskz_set1_epi8(__mmask16 __M, char __A) { return (__m128i)__builtin_ia32_pbroadcastb128_gpr_mask( __A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_broadcastw_epi16(__m256i __O, __mmask16 __M, __m128i __A) { +__funline __m256i _mm256_mask_broadcastw_epi16(__m256i __O, __mmask16 __M, + __m128i __A) { return (__m256i)__builtin_ia32_pbroadcastw256_mask((__v8hi)__A, (__v16hi)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm256_maskz_broadcastw_epi16(__mmask16 __M, __m128i __A) { +__funline __m256i _mm256_maskz_broadcastw_epi16(__mmask16 __M, __m128i __A) { return (__m256i)__builtin_ia32_pbroadcastw256_mask( (__v8hi)__A, (__v16hi)_mm256_setzero_si256(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) { +__funline __m256i _mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) { return (__m256i)__builtin_ia32_pbroadcastw256_gpr_mask(__A, (__v16hi)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_set1_epi16(__mmask16 __M, short __A) { +__funline __m256i _mm256_maskz_set1_epi16(__mmask16 __M, short __A) { return (__m256i)__builtin_ia32_pbroadcastw256_gpr_mask( __A, (__v16hi)_mm256_setzero_si256(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_broadcastw_epi16(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_broadcastw_epi16(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pbroadcastw128_mask((__v8hi)__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_broadcastw_epi16(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_broadcastw_epi16(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pbroadcastw128_mask( (__v8hi)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) { +__funline __m128i _mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) { return (__m128i)__builtin_ia32_pbroadcastw128_gpr_mask(__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_set1_epi16(__mmask8 __M, short __A) { +__funline __m128i _mm_maskz_set1_epi16(__mmask8 __M, short __A) { return (__m128i)__builtin_ia32_pbroadcastw128_gpr_mask( __A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutexvar_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_permutexvar_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_permvarhi256_mask( (__v16hi)__B, (__v16hi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permutexvar_epi16(__mmask16 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_permutexvar_epi16(__mmask16 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_permvarhi256_mask( (__v16hi)__B, (__v16hi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_permutexvar_epi16(__m256i __W, __mmask16 __M, __m256i __A, - __m256i __B) { +__funline __m256i _mm256_mask_permutexvar_epi16(__m256i __W, __mmask16 __M, + __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_permvarhi256_mask( (__v16hi)__B, (__v16hi)__A, (__v16hi)__W, (__mmask16)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_permutexvar_epi16(__m128i __A, __m128i __B) { +__funline __m128i _mm_permutexvar_epi16(__m128i __A, 
__m128i __B) { return (__m128i)__builtin_ia32_permvarhi128_mask( (__v8hi)__B, (__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_permutexvar_epi16(__mmask8 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_permutexvar_epi16(__mmask8 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_permvarhi128_mask( (__v8hi)__B, (__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_permutexvar_epi16(__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { +__funline __m128i _mm_mask_permutexvar_epi16(__m128i __W, __mmask8 __M, + __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_permvarhi128_mask((__v8hi)__B, (__v8hi)__A, (__v8hi)__W, (__mmask8)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) { +__funline __m256i _mm256_permutex2var_epi16(__m256i __A, __m256i __I, + __m256i __B) { return (__m256i)__builtin_ia32_vpermt2varhi256_mask( (__v16hi)__I /* idx */, (__v16hi)__A, (__v16hi)__B, (__mmask16)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I, - __m256i __B) { +__funline __m256i _mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, + __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermt2varhi256_mask( (__v16hi)__I /* idx */, (__v16hi)__A, (__v16hi)__B, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U, - __m256i __B) { +__funline __m256i _mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, + __mmask16 __U, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varhi256_mask((__v16hi)__A, (__v16hi)__I /* idx */, @@ -462,39 +357,31 @@ extern __inline __m256i (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i __I, - __m256i __B) { +__funline __m256i _mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, + __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermt2varhi256_maskz( (__v16hi)__I /* idx */, (__v16hi)__A, (__v16hi)__B, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) { +__funline __m128i _mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermt2varhi128_mask((__v8hi)__I /* idx */, (__v8hi)__A, (__v8hi)__B, (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I, - __m128i __B) { +__funline __m128i _mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, + __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermt2varhi128_mask((__v8hi)__I /* idx */, (__v8hi)__A, (__v8hi)__B, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, - __m128i __B) { +__funline __m128i 
_mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, + __mmask8 __U, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varhi128_mask((__v8hi)__A, (__v8hi)__I /* idx */, @@ -502,776 +389,616 @@ extern __inline __m128i (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_permutex2var_epi16(__mmask8 __U, __m128i __A, __m128i __I, - __m128i __B) { +__funline __m128i _mm_maskz_permutex2var_epi16(__mmask8 __U, __m128i __A, + __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermt2varhi128_maskz((__v8hi)__I /* idx */, (__v8hi)__A, (__v8hi)__B, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, - __m256i __Y) { +__funline __m256i _mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, + __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_pmaddubsw256_mask( (__v32qi)__X, (__v32qi)__Y, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { +__funline __m256i _mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, + __m256i __Y) { return (__m256i)__builtin_ia32_pmaddubsw256_mask( (__v32qi)__X, (__v32qi)__Y, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { +__funline __m128i _mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) { return (__m128i)__builtin_ia32_pmaddubsw128_mask((__v16qi)__X, (__v16qi)__Y, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { +__funline __m128i _mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, + __m128i __Y) { return (__m128i)__builtin_ia32_pmaddubsw128_mask( (__v16qi)__X, (__v16qi)__Y, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmaddwd256_mask((__v16hi)__A, (__v16hi)__B, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmaddwd256_mask((__v16hi)__A, (__v16hi)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pmaddwd128_mask((__v8hi)__A, (__v8hi)__B, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return 
(__m128i)__builtin_ia32_pmaddwd128_mask( (__v8hi)__A, (__v8hi)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_movepi8_mask(__m128i __A) { +__funline __mmask16 _mm_movepi8_mask(__m128i __A) { return (__mmask16)__builtin_ia32_cvtb2mask128((__v16qi)__A); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_movepi8_mask(__m256i __A) { +__funline __mmask32 _mm256_movepi8_mask(__m256i __A) { return (__mmask32)__builtin_ia32_cvtb2mask256((__v32qi)__A); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_movepi16_mask(__m128i __A) { +__funline __mmask8 _mm_movepi16_mask(__m128i __A) { return (__mmask8)__builtin_ia32_cvtw2mask128((__v8hi)__A); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_movepi16_mask(__m256i __A) { +__funline __mmask16 _mm256_movepi16_mask(__m256i __A) { return (__mmask16)__builtin_ia32_cvtw2mask256((__v16hi)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_movm_epi8(__mmask16 __A) { +__funline __m128i _mm_movm_epi8(__mmask16 __A) { return (__m128i)__builtin_ia32_cvtmask2b128(__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_movm_epi8(__mmask32 __A) { +__funline __m256i _mm256_movm_epi8(__mmask32 __A) { return (__m256i)__builtin_ia32_cvtmask2b256(__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_movm_epi16(__mmask8 __A) { +__funline __m128i _mm_movm_epi16(__mmask8 __A) { return (__m128i)__builtin_ia32_cvtmask2w128(__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_movm_epi16(__mmask16 __A) { +__funline __m256i _mm256_movm_epi16(__mmask16 __A) { return (__m256i)__builtin_ia32_cvtmask2w256(__A); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_test_epi8_mask(__m128i __A, __m128i __B) { +__funline __mmask16 _mm_test_epi8_mask(__m128i __A, __m128i __B) { return (__mmask16)__builtin_ia32_ptestmb128((__v16qi)__A, (__v16qi)__B, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __mmask16 _mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, + __m128i __B) { return (__mmask16)__builtin_ia32_ptestmb128((__v16qi)__A, (__v16qi)__B, __U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_test_epi8_mask(__m256i __A, __m256i __B) { +__funline __mmask32 _mm256_test_epi8_mask(__m256i __A, __m256i __B) { return (__mmask32)__builtin_ia32_ptestmb256((__v32qi)__A, (__v32qi)__B, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_test_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __mmask32 _mm256_mask_test_epi8_mask(__mmask32 __U, __m256i __A, + __m256i __B) { return (__mmask32)__builtin_ia32_ptestmb256((__v32qi)__A, (__v32qi)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_test_epi16_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_test_epi16_mask(__m128i __A, __m128i __B) { return 
(__mmask8)__builtin_ia32_ptestmw128((__v8hi)__A, (__v8hi)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_test_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_test_epi16_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_ptestmw128((__v8hi)__A, (__v8hi)__B, __U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_test_epi16_mask(__m256i __A, __m256i __B) { +__funline __mmask16 _mm256_test_epi16_mask(__m256i __A, __m256i __B) { return (__mmask16)__builtin_ia32_ptestmw256((__v16hi)__A, (__v16hi)__B, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_test_epi16_mask(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __mmask16 _mm256_mask_test_epi16_mask(__mmask16 __U, __m256i __A, + __m256i __B) { return (__mmask16)__builtin_ia32_ptestmw256((__v16hi)__A, (__v16hi)__B, __U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pminuw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__M); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pminuw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pminuw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pminuw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pminsw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__M); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pminsw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_max_epu8(__mmask32 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_max_epu8(__mmask32 __M, __m256i __A, __m256i __B) { return 
(__m256i)__builtin_ia32_pmaxub256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmaxub256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pmaxub128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pmaxub128_mask((__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pmaxsb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmaxsb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pmaxsb128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pmaxsb128_mask((__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_min_epu8(__mmask32 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_min_epu8(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pminub256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pminub256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pminub128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pminub128_mask((__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pminsb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pminsb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pminsb128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pminsb128_mask((__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmaxsw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__M); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmaxsw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pmaxsw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) { return 
(__m128i)__builtin_ia32_pmaxsw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmaxuw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__M); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmaxuw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pmaxuw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pmaxuw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pminsw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pminsw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__M); } #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_alignr_epi8(__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B, const int __N) { +__funline __m256i _mm256_mask_alignr_epi8(__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B, const int __N) { return (__m256i)__builtin_ia32_palignr256_mask( (__v4di)__A, (__v4di)__B, __N * 8, (__v4di)__W, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_alignr_epi8(__mmask32 __U, __m256i __A, __m256i __B, - const int __N) { +__funline __m256i _mm256_maskz_alignr_epi8(__mmask32 __U, __m256i __A, + __m256i __B, const int __N) { return (__m256i)__builtin_ia32_palignr256_mask( (__v4di)__A, (__v4di)__B, __N * 8, (__v4di)_mm256_setzero_si256(), (__mmask32)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_alignr_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B, - const int __N) { +__funline __m128i _mm_mask_alignr_epi8(__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B, const int __N) { return (__m128i)__builtin_ia32_palignr128_mask( (__v2di)__A, (__v2di)__B, __N * 8, 
(__v2di)__W, (__mmask16)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_alignr_epi8(__mmask16 __U, __m128i __A, __m128i __B, const int __N) { +__funline __m128i _mm_maskz_alignr_epi8(__mmask16 __U, __m128i __A, __m128i __B, + const int __N) { return (__m128i)__builtin_ia32_palignr128_mask( (__v2di)__A, (__v2di)__B, __N * 8, (__v2di)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_dbsad_epu8(__m256i __A, __m256i __B, const int __imm) { +__funline __m256i _mm256_dbsad_epu8(__m256i __A, __m256i __B, const int __imm) { return (__m256i)__builtin_ia32_dbpsadbw256_mask( (__v32qi)__A, (__v32qi)__B, __imm, (__v16hi)_mm256_setzero_si256(), (__mmask16)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_dbsad_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B, - const int __imm) { +__funline __m256i _mm256_mask_dbsad_epu8(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B, const int __imm) { return (__m256i)__builtin_ia32_dbpsadbw256_mask( (__v32qi)__A, (__v32qi)__B, __imm, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_dbsad_epu8(__mmask16 __U, __m256i __A, __m256i __B, - const int __imm) { +__funline __m256i _mm256_maskz_dbsad_epu8(__mmask16 __U, __m256i __A, __m256i __B, + const int __imm) { return (__m256i)__builtin_ia32_dbpsadbw256_mask( (__v32qi)__A, (__v32qi)__B, __imm, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_dbsad_epu8(__m128i __A, __m128i __B, const int __imm) { +__funline __m128i _mm_dbsad_epu8(__m128i __A, __m128i __B, const int __imm) { return (__m128i)__builtin_ia32_dbpsadbw128_mask( (__v16qi)__A, (__v16qi)__B, __imm, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_dbsad_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B, - const int __imm) { +__funline __m128i _mm_mask_dbsad_epu8(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B, const int __imm) { return (__m128i)__builtin_ia32_dbpsadbw128_mask( (__v16qi)__A, (__v16qi)__B, __imm, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B, const int __imm) { +__funline __m128i _mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B, + const int __imm) { return (__m128i)__builtin_ia32_dbpsadbw128_mask( (__v16qi)__A, (__v16qi)__B, __imm, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_blend_epi16(__mmask8 __U, __m128i __A, __m128i __W) { +__funline __m128i _mm_mask_blend_epi16(__mmask8 __U, __m128i __A, __m128i __W) { return (__m128i)__builtin_ia32_blendmw_128_mask((__v8hi)__A, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_blend_epi8(__mmask16 __U, __m128i __A, __m128i __W) { +__funline __m128i _mm_mask_blend_epi8(__mmask16 __U, __m128i __A, __m128i __W) { return (__m128i)__builtin_ia32_blendmb_128_mask((__v16qi)__A, (__v16qi)__W, (__mmask16)__U); } -extern __inline __m256i - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_blend_epi16(__mmask16 __U, __m256i __A, __m256i __W) { +__funline __m256i _mm256_mask_blend_epi16(__mmask16 __U, __m256i __A, + __m256i __W) { return (__m256i)__builtin_ia32_blendmw_256_mask((__v16hi)__A, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_blend_epi8(__mmask32 __U, __m256i __A, __m256i __W) { +__funline __m256i _mm256_mask_blend_epi8(__mmask32 __U, __m256i __A, + __m256i __W) { return (__m256i)__builtin_ia32_blendmb_256_mask((__v32qi)__A, (__v32qi)__W, (__mmask32)__U); } -extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_cmp_epi16_mask(__mmask8 __U, __m128i __X, __m128i __Y, const int __P) { +__funline __mmask8 _mm_mask_cmp_epi16_mask(__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) { return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, __P, (__mmask8)__U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmp_epi16_mask(__m128i __X, __m128i __Y, const int __P) { +__funline __mmask8 _mm_cmp_epi16_mask(__m128i __X, __m128i __Y, const int __P) { return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, __P, (__mmask8)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmp_epi16_mask(__mmask16 __U, __m256i __X, __m256i __Y, - const int __P) { +__funline __mmask16 _mm256_mask_cmp_epi16_mask(__mmask16 __U, __m256i __X, + __m256i __Y, const int __P) { return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, __P, (__mmask16)__U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmp_epi16_mask(__m256i __X, __m256i __Y, const int __P) { +__funline __mmask16 _mm256_cmp_epi16_mask(__m256i __X, __m256i __Y, + const int __P) { return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, __P, (__mmask16)-1); } -extern __inline __mmask16 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_cmp_epi8_mask(__mmask16 __U, __m128i __X, __m128i __Y, const int __P) { +__funline __mmask16 _mm_mask_cmp_epi8_mask(__mmask16 __U, __m128i __X, + __m128i __Y, const int __P) { return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, __P, (__mmask16)__U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmp_epi8_mask(__m128i __X, __m128i __Y, const int __P) { +__funline __mmask16 _mm_cmp_epi8_mask(__m128i __X, __m128i __Y, const int __P) { return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, __P, (__mmask16)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmp_epi8_mask(__mmask32 __U, __m256i __X, __m256i __Y, - const int __P) { +__funline __mmask32 _mm256_mask_cmp_epi8_mask(__mmask32 __U, __m256i __X, + __m256i __Y, const int __P) { return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, __P, (__mmask32)__U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmp_epi8_mask(__m256i __X, __m256i __Y, const int __P) { +__funline __mmask32 _mm256_cmp_epi8_mask(__m256i __X, __m256i __Y, + const int __P) { return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, __P, 
(__mmask32)-1); } -extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_cmp_epu16_mask(__mmask8 __U, __m128i __X, __m128i __Y, const int __P) { +__funline __mmask8 _mm_mask_cmp_epu16_mask(__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) { return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, __P, (__mmask8)__U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmp_epu16_mask(__m128i __X, __m128i __Y, const int __P) { +__funline __mmask8 _mm_cmp_epu16_mask(__m128i __X, __m128i __Y, const int __P) { return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, __P, (__mmask8)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmp_epu16_mask(__mmask16 __U, __m256i __X, __m256i __Y, - const int __P) { +__funline __mmask16 _mm256_mask_cmp_epu16_mask(__mmask16 __U, __m256i __X, + __m256i __Y, const int __P) { return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, __P, (__mmask16)__U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmp_epu16_mask(__m256i __X, __m256i __Y, const int __P) { +__funline __mmask16 _mm256_cmp_epu16_mask(__m256i __X, __m256i __Y, + const int __P) { return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, __P, (__mmask16)-1); } -extern __inline __mmask16 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_cmp_epu8_mask(__mmask16 __U, __m128i __X, __m128i __Y, const int __P) { +__funline __mmask16 _mm_mask_cmp_epu8_mask(__mmask16 __U, __m128i __X, + __m128i __Y, const int __P) { return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, __P, (__mmask16)__U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmp_epu8_mask(__m128i __X, __m128i __Y, const int __P) { +__funline __mmask16 _mm_cmp_epu8_mask(__m128i __X, __m128i __Y, const int __P) { return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, __P, (__mmask16)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmp_epu8_mask(__mmask32 __U, __m256i __X, __m256i __Y, - const int __P) { +__funline __mmask32 _mm256_mask_cmp_epu8_mask(__mmask32 __U, __m256i __X, + __m256i __Y, const int __P) { return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, __P, (__mmask32)__U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmp_epu8_mask(__m256i __X, __m256i __Y, const int __P) { +__funline __mmask32 _mm256_cmp_epu8_mask(__m256i __X, __m256i __Y, + const int __P) { return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, __P, (__mmask32)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, - const int __imm) { +__funline __m256i _mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, + const int __imm) { return (__m256i)__builtin_ia32_psrlwi256_mask((__v16hi)__A, __imm, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, const int __imm) { +__funline __m256i _mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, + 
const int __imm) { return (__m256i)__builtin_ia32_psrlwi256_mask( (__v16hi)__A, __imm, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, const int __imm) { +__funline __m128i _mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) { return (__m128i)__builtin_ia32_psrlwi128_mask((__v8hi)__A, __imm, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_srli_epi16(__mmask8 __U, __m128i __A, const int __imm) { +__funline __m128i _mm_maskz_srli_epi16(__mmask8 __U, __m128i __A, + const int __imm) { return (__m128i)__builtin_ia32_psrlwi128_mask( (__v8hi)__A, __imm, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_shufflehi_epi16(__m256i __W, __mmask16 __U, __m256i __A, - const int __imm) { +__funline __m256i _mm256_mask_shufflehi_epi16(__m256i __W, __mmask16 __U, + __m256i __A, const int __imm) { return (__m256i)__builtin_ia32_pshufhw256_mask((__v16hi)__A, __imm, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_shufflehi_epi16(__mmask16 __U, __m256i __A, const int __imm) { +__funline __m256i _mm256_maskz_shufflehi_epi16(__mmask16 __U, __m256i __A, + const int __imm) { return (__m256i)__builtin_ia32_pshufhw256_mask( (__v16hi)__A, __imm, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shufflehi_epi16(__m128i __W, __mmask8 __U, __m128i __A, - const int __imm) { +__funline __m128i _mm_mask_shufflehi_epi16(__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) { return (__m128i)__builtin_ia32_pshufhw128_mask((__v8hi)__A, __imm, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shufflehi_epi16(__mmask8 __U, __m128i __A, const int __imm) { +__funline __m128i _mm_maskz_shufflehi_epi16(__mmask8 __U, __m128i __A, + const int __imm) { return (__m128i)__builtin_ia32_pshufhw128_mask( (__v8hi)__A, __imm, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_shufflelo_epi16(__m256i __W, __mmask16 __U, __m256i __A, - const int __imm) { +__funline __m256i _mm256_mask_shufflelo_epi16(__m256i __W, __mmask16 __U, + __m256i __A, const int __imm) { return (__m256i)__builtin_ia32_pshuflw256_mask((__v16hi)__A, __imm, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_shufflelo_epi16(__mmask16 __U, __m256i __A, const int __imm) { +__funline __m256i _mm256_maskz_shufflelo_epi16(__mmask16 __U, __m256i __A, + const int __imm) { return (__m256i)__builtin_ia32_pshuflw256_mask( (__v16hi)__A, __imm, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shufflelo_epi16(__m128i __W, __mmask8 __U, __m128i __A, - const int __imm) { +__funline __m128i _mm_mask_shufflelo_epi16(__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) { return (__m128i)__builtin_ia32_pshuflw128_mask((__v8hi)__A, 
__imm, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shufflelo_epi16(__mmask8 __U, __m128i __A, const int __imm) { +__funline __m128i _mm_maskz_shufflelo_epi16(__mmask8 __U, __m128i __A, + const int __imm) { return (__m128i)__builtin_ia32_pshuflw128_mask( (__v8hi)__A, __imm, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, - const int __imm) { +__funline __m256i _mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, + const int __imm) { return (__m256i)__builtin_ia32_psrawi256_mask((__v16hi)__A, __imm, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, const int __imm) { +__funline __m256i _mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, + const int __imm) { return (__m256i)__builtin_ia32_psrawi256_mask( (__v16hi)__A, __imm, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, const int __imm) { +__funline __m128i _mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) { return (__m128i)__builtin_ia32_psrawi128_mask((__v8hi)__A, __imm, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, const int __imm) { +__funline __m128i _mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, + const int __imm) { return (__m128i)__builtin_ia32_psrawi128_mask( (__v8hi)__A, __imm, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B) { +__funline __m256i _mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, + int __B) { return (__m256i)__builtin_ia32_psllwi256_mask((__v16hi)__A, __B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, int __B) { +__funline __m256i _mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_psllwi256_mask( (__v16hi)__A, __B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) { +__funline __m128i _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, + int __B) { return (__m128i)__builtin_ia32_psllwi128_mask((__v8hi)__A, __B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_slli_epi16(__mmask8 __U, __m128i __A, int __B) { +__funline __m128i _mm_maskz_slli_epi16(__mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_psllwi128_mask( (__v8hi)__A, __B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } @@ -1503,1879 +1230,1504 @@ extern __inline __m128i (__v32qi)(__m256i)(X), (__v32qi)(__m256i)(Y), (int)(P), (__mmask32)M)) #endif -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm256_cmpneq_epi8_mask(__m256i __X, __m256i __Y) { +__funline __mmask32 _mm256_cmpneq_epi8_mask(__m256i __X, __m256i __Y) { return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 4, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmplt_epi8_mask(__m256i __X, __m256i __Y) { +__funline __mmask32 _mm256_cmplt_epi8_mask(__m256i __X, __m256i __Y) { return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 1, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpge_epi8_mask(__m256i __X, __m256i __Y) { +__funline __mmask32 _mm256_cmpge_epi8_mask(__m256i __X, __m256i __Y) { return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 5, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmple_epi8_mask(__m256i __X, __m256i __Y) { +__funline __mmask32 _mm256_cmple_epi8_mask(__m256i __X, __m256i __Y) { return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 2, (__mmask32)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpneq_epi16_mask(__m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_cmpneq_epi16_mask(__m256i __X, __m256i __Y) { return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 4, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmplt_epi16_mask(__m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_cmplt_epi16_mask(__m256i __X, __m256i __Y) { return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 1, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpge_epi16_mask(__m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_cmpge_epi16_mask(__m256i __X, __m256i __Y) { return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 5, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmple_epi16_mask(__m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_cmple_epi16_mask(__m256i __X, __m256i __Y) { return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 2, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpneq_epu8_mask(__m128i __X, __m128i __Y) { +__funline __mmask16 _mm_cmpneq_epu8_mask(__m128i __X, __m128i __Y) { return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 4, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmplt_epu8_mask(__m128i __X, __m128i __Y) { +__funline __mmask16 _mm_cmplt_epu8_mask(__m128i __X, __m128i __Y) { return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 1, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpge_epu8_mask(__m128i __X, __m128i __Y) { +__funline __mmask16 _mm_cmpge_epu8_mask(__m128i __X, __m128i __Y) { return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 5, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmple_epu8_mask(__m128i __X, __m128i __Y) { +__funline 
__mmask16 _mm_cmple_epu8_mask(__m128i __X, __m128i __Y) { return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 2, (__mmask16)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpneq_epu16_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmpneq_epu16_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 4, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmplt_epu16_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmplt_epu16_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 1, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpge_epu16_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmpge_epu16_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 5, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmple_epu16_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmple_epu16_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 2, (__mmask8)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpneq_epi8_mask(__m128i __X, __m128i __Y) { +__funline __mmask16 _mm_cmpneq_epi8_mask(__m128i __X, __m128i __Y) { return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 4, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmplt_epi8_mask(__m128i __X, __m128i __Y) { +__funline __mmask16 _mm_cmplt_epi8_mask(__m128i __X, __m128i __Y) { return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 1, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpge_epi8_mask(__m128i __X, __m128i __Y) { +__funline __mmask16 _mm_cmpge_epi8_mask(__m128i __X, __m128i __Y) { return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 5, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmple_epi8_mask(__m128i __X, __m128i __Y) { +__funline __mmask16 _mm_cmple_epi8_mask(__m128i __X, __m128i __Y) { return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 2, (__mmask16)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpneq_epi16_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmpneq_epi16_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 4, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmplt_epi16_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmplt_epi16_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 1, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpge_epi16_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmpge_epi16_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 5, (__mmask8)-1); } -extern 
__inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmple_epi16_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmple_epi16_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 2, (__mmask8)-1); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { +__funline __m256i _mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, + __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_pmulhrsw256_mask((__v16hi)__X, (__v16hi)__Y, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { +__funline __m256i _mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, + __m256i __Y) { return (__m256i)__builtin_ia32_pmulhrsw256_mask( (__v16hi)__X, (__v16hi)__Y, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmulhuw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmulhuw256_mask( (__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmulhw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmulhw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pmulhw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pmulhw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_mulhi_epu16(__m128i 
__W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pmulhuw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pmulhuw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { +__funline __m128i _mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) { return (__m128i)__builtin_ia32_pmulhrsw128_mask((__v8hi)__X, (__v8hi)__Y, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { +__funline __m128i _mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_pmulhrsw128_mask( (__v8hi)__X, (__v8hi)__Y, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmullw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmullw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pmullw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pmullw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { +__funline __m256i _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, + __m128i __A) { return (__m256i)__builtin_ia32_pmovsxbw256_mask((__v16qi)__A, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) { +__funline __m256i _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) { return (__m256i)__builtin_ia32_pmovsxbw256_mask( (__v16qi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pmovsxbw128_mask((__v16qi)__A, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pmovsxbw128_mask( (__v16qi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { +__funline __m256i _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, + __m128i __A) { return (__m256i)__builtin_ia32_pmovzxbw256_mask((__v16qi)__A, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) { +__funline __m256i _mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) { return (__m256i)__builtin_ia32_pmovzxbw256_mask( (__v16qi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pmovzxbw128_mask((__v16qi)__A, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pmovzxbw128_mask( (__v16qi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pavgb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pavgb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pavgb128_mask((__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pavgb128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_avg_epu16(__m256i __W, 
__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pavgw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pavgw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pavgw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pavgw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_paddb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_paddb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_paddw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_paddw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_paddsb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_paddsb256_mask((__v32qi)__A, (__v32qi)__B, 
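/*
 * [Editorial aside.] pavgb/pavgw compute a rounding average: the +1 keeps
 * the halved sum exact before truncation, so halves round up. One lane:
 */
static unsigned char avg_epu8_lane(unsigned char a, unsigned char b) {
  return (unsigned char)(((unsigned)a + (unsigned)b + 1) >> 1); /* rounds .5 up */
}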
(__v32qi)_mm256_setzero_si256(), (__mmask32)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_paddsw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_paddsw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_paddusb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_paddusb256_mask( (__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_paddusw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_paddusw256_mask( (__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psubb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_psubb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psubw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
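/*
 * [Editorial aside.] The adds_* forms saturate instead of wrapping:
 * paddsb clamps to [-128, 127] and paddusb to [0, 255]. Signed lane sketch:
 */
static signed char adds_epi8_lane(signed char a, signed char b) {
  int s = (int)a + (int)b;
  if (s > 127) s = 127;    /* positive overflow clamps at INT8_MAX */
  if (s < -128) s = -128;  /* negative overflow clamps at INT8_MIN */
  return (signed char)s;
}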
_mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psubw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psubsb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psubsb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psubsw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psubsw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psubusb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psubusb256_mask( (__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psubusw256_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psubusw256_mask( (__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_add_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_add_epi8(__m128i __W, __mmask16 __U, __m128i 
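/*
 * [Editorial aside.] Likewise the subs_epu* forms floor at zero rather
 * than wrapping; psubusb of a smaller minuend simply yields 0:
 */
static unsigned char subs_epu8_lane(unsigned char a, unsigned char b) {
  return a > b ? (unsigned char)(a - b) : 0; /* no wraparound */
}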
__A, + __m128i __B) { return (__m128i)__builtin_ia32_paddb128_mask((__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_paddb128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_paddw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_paddw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B) { +__funline __m256i _mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, + __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpckhbw256_mask( (__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_punpckhbw256_mask( (__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpckhbw128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpckhbw128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { +__funline __m256i _mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, + __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpckhwd256_mask( (__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_punpckhwd256_mask( (__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern 
__inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpckhwd128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpckhwd128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, - __m256i __B) { +__funline __m256i _mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, + __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpcklbw256_mask( (__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_punpcklbw256_mask( (__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpcklbw128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpcklbw128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { +__funline __m256i _mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, + __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpcklwd256_mask( (__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_punpcklwd256_mask( (__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpcklwd128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
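/*
 * [Editorial aside.] punpcklbw interleaves the low eight bytes of each
 * source operand; the 256-bit forms repeat that within each 128-bit lane
 * rather than across the whole register. 128-bit reference sketch:
 */
#include <stdint.h>
static void unpacklo_epi8_ref(uint8_t r[16], const uint8_t a[16],
                              const uint8_t b[16]) {
  for (int i = 0; i < 8; ++i) {
    r[2 * i] = a[i];      /* even slots from the first operand */
    r[2 * i + 1] = b[i];  /* odd slots from the second */
  }
}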
_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpcklwd128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpeq_epi8_mask(__m128i __A, __m128i __B) { +__funline __mmask16 _mm_cmpeq_epi8_mask(__m128i __A, __m128i __B) { return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__A, (__v16qi)__B, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpeq_epu8_mask(__m128i __A, __m128i __B) { +__funline __mmask16 _mm_cmpeq_epu8_mask(__m128i __A, __m128i __B) { return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__A, (__v16qi)__B, 0, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpeq_epu8_mask(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __mmask16 _mm_mask_cmpeq_epu8_mask(__mmask16 __U, __m128i __A, + __m128i __B) { return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__A, (__v16qi)__B, 0, __U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpeq_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __mmask16 _mm_mask_cmpeq_epi8_mask(__mmask16 __U, __m128i __A, + __m128i __B) { return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__A, (__v16qi)__B, __U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpeq_epu8_mask(__m256i __A, __m256i __B) { +__funline __mmask32 _mm256_cmpeq_epu8_mask(__m256i __A, __m256i __B) { return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__A, (__v32qi)__B, 0, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpeq_epi8_mask(__m256i __A, __m256i __B) { +__funline __mmask32 _mm256_cmpeq_epi8_mask(__m256i __A, __m256i __B) { return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__A, (__v32qi)__B, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpeq_epu8_mask(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __mmask32 _mm256_mask_cmpeq_epu8_mask(__mmask32 __U, __m256i __A, + __m256i __B) { return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__A, (__v32qi)__B, 0, __U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpeq_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __mmask32 _mm256_mask_cmpeq_epi8_mask(__mmask32 __U, __m256i __A, + __m256i __B) { return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__A, (__v32qi)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpeq_epu16_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_cmpeq_epu16_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__A, (__v8hi)__B, 0, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpeq_epi16_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_cmpeq_epi16_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__A, (__v8hi)__B, (__mmask8)-1); } -extern __inline __mmask8 - 
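/*
 * [Editorial aside.] Unlike the SSE/AVX pcmpeq forms that return all-ones
 * lanes, these comparisons return an __mmask with one bit per lane, and
 * the (__mmask16)-1 default means "no lanes excluded". Scalar model of
 * _mm_cmpeq_epi8_mask, helper name invented:
 */
#include <stdint.h>
static uint16_t cmpeq_epi8_mask_ref(const int8_t a[16], const int8_t b[16]) {
  uint16_t m = 0;
  for (int i = 0; i < 16; ++i) m |= (uint16_t)(a[i] == b[i]) << i;
  return m;
}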
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpeq_epu16_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_cmpeq_epu16_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__A, (__v8hi)__B, 0, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpeq_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_cmpeq_epi16_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__A, (__v8hi)__B, __U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpeq_epu16_mask(__m256i __A, __m256i __B) { +__funline __mmask16 _mm256_cmpeq_epu16_mask(__m256i __A, __m256i __B) { return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__A, (__v16hi)__B, 0, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpeq_epi16_mask(__m256i __A, __m256i __B) { +__funline __mmask16 _mm256_cmpeq_epi16_mask(__m256i __A, __m256i __B) { return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__A, (__v16hi)__B, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpeq_epu16_mask(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __mmask16 _mm256_mask_cmpeq_epu16_mask(__mmask16 __U, __m256i __A, + __m256i __B) { return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__A, (__v16hi)__B, 0, __U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpeq_epi16_mask(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __mmask16 _mm256_mask_cmpeq_epi16_mask(__mmask16 __U, __m256i __A, + __m256i __B) { return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__A, (__v16hi)__B, __U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpgt_epu8_mask(__m128i __A, __m128i __B) { +__funline __mmask16 _mm_cmpgt_epu8_mask(__m128i __A, __m128i __B) { return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__A, (__v16qi)__B, 6, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpgt_epi8_mask(__m128i __A, __m128i __B) { +__funline __mmask16 _mm_cmpgt_epi8_mask(__m128i __A, __m128i __B) { return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__A, (__v16qi)__B, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpgt_epu8_mask(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __mmask16 _mm_mask_cmpgt_epu8_mask(__mmask16 __U, __m128i __A, + __m128i __B) { return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__A, (__v16qi)__B, 6, __U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpgt_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __mmask16 _mm_mask_cmpgt_epi8_mask(__mmask16 __U, __m128i __A, + __m128i __B) { return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__A, (__v16qi)__B, __U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpgt_epu8_mask(__m256i __A, __m256i __B) { +__funline __mmask32 _mm256_cmpgt_epu8_mask(__m256i __A, __m256i __B) { return 
(__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__A, (__v32qi)__B, 6, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpgt_epi8_mask(__m256i __A, __m256i __B) { +__funline __mmask32 _mm256_cmpgt_epi8_mask(__m256i __A, __m256i __B) { return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__A, (__v32qi)__B, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpgt_epu8_mask(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __mmask32 _mm256_mask_cmpgt_epu8_mask(__mmask32 __U, __m256i __A, + __m256i __B) { return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__A, (__v32qi)__B, 6, __U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpgt_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __mmask32 _mm256_mask_cmpgt_epi8_mask(__mmask32 __U, __m256i __A, + __m256i __B) { return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__A, (__v32qi)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpgt_epu16_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_cmpgt_epu16_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__A, (__v8hi)__B, 6, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpgt_epi16_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_cmpgt_epi16_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__A, (__v8hi)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpgt_epu16_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_cmpgt_epu16_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__A, (__v8hi)__B, 6, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpgt_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_cmpgt_epi16_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__A, (__v8hi)__B, __U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpgt_epu16_mask(__m256i __A, __m256i __B) { +__funline __mmask16 _mm256_cmpgt_epu16_mask(__m256i __A, __m256i __B) { return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__A, (__v16hi)__B, 6, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpgt_epi16_mask(__m256i __A, __m256i __B) { +__funline __mmask16 _mm256_cmpgt_epi16_mask(__m256i __A, __m256i __B) { return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__A, (__v16hi)__B, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpgt_epu16_mask(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __mmask16 _mm256_mask_cmpgt_epu16_mask(__mmask16 __U, __m256i __A, + __m256i __B) { return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__A, (__v16hi)__B, 6, __U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpgt_epi16_mask(__mmask16 __U, __m256i __A, __m256i __B) { 
+__funline __mmask16 _mm256_mask_cmpgt_epi16_mask(__mmask16 __U, __m256i __A, + __m256i __B) { return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__A, (__v16hi)__B, __U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_testn_epi8_mask(__m128i __A, __m128i __B) { +__funline __mmask16 _mm_testn_epi8_mask(__m128i __A, __m128i __B) { return (__mmask16)__builtin_ia32_ptestnmb128((__v16qi)__A, (__v16qi)__B, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_testn_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __mmask16 _mm_mask_testn_epi8_mask(__mmask16 __U, __m128i __A, + __m128i __B) { return (__mmask16)__builtin_ia32_ptestnmb128((__v16qi)__A, (__v16qi)__B, __U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_testn_epi8_mask(__m256i __A, __m256i __B) { +__funline __mmask32 _mm256_testn_epi8_mask(__m256i __A, __m256i __B) { return (__mmask32)__builtin_ia32_ptestnmb256((__v32qi)__A, (__v32qi)__B, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_testn_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __mmask32 _mm256_mask_testn_epi8_mask(__mmask32 __U, __m256i __A, + __m256i __B) { return (__mmask32)__builtin_ia32_ptestnmb256((__v32qi)__A, (__v32qi)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_testn_epi16_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_testn_epi16_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_ptestnmw128((__v8hi)__A, (__v8hi)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_testn_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_testn_epi16_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_ptestnmw128((__v8hi)__A, (__v8hi)__B, __U); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_testn_epi16_mask(__m256i __A, __m256i __B) { +__funline __mmask16 _mm256_testn_epi16_mask(__m256i __A, __m256i __B) { return (__mmask16)__builtin_ia32_ptestnmw256((__v16hi)__A, (__v16hi)__B, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_testn_epi16_mask(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __mmask16 _mm256_mask_testn_epi16_mask(__mmask16 __U, __m256i __A, + __m256i __B) { return (__mmask16)__builtin_ia32_ptestnmw256((__v16hi)__A, (__v16hi)__B, __U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, + __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pshufb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)__W, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pshufb256_mask((__v32qi)__A, (__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); } -extern 
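/*
 * [Editorial aside.] ptestnm sets a mask bit where the bitwise AND of the
 * two lanes is zero, i.e. "test for no common bits". Sketch:
 */
#include <stdint.h>
static uint16_t testn_epi8_mask_ref(const uint8_t a[16], const uint8_t b[16]) {
  uint16_t m = 0;
  for (int i = 0; i < 16; ++i) m |= (uint16_t)((a[i] & b[i]) == 0) << i;
  return m;
}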
__inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pshufb128_mask((__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pshufb128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_packsswb256_mask( (__v16hi)__A, (__v16hi)__B, (__v32qi)_mm256_setzero_si256(), __M); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_packsswb256_mask((__v16hi)__A, (__v16hi)__B, (__v32qi)__W, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_packsswb128_mask( (__v8hi)__A, (__v8hi)__B, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_packsswb128_mask((__v8hi)__A, (__v8hi)__B, (__v16qi)__W, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_packuswb256_mask( (__v16hi)__A, (__v16hi)__B, (__v32qi)_mm256_setzero_si256(), __M); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, + __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_packuswb256_mask((__v16hi)__A, (__v16hi)__B, (__v32qi)__W, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_packuswb128_mask( (__v8hi)__A, (__v8hi)__B, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_packus_epi16(__m128i __W, 
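/*
 * [Editorial aside.] packsswb narrows words to bytes with signed
 * saturation (packuswb uses unsigned saturation); within each 128-bit
 * result lane, the low half comes from the first operand and the high
 * half from the second. The signed saturation rule for one value:
 */
static signed char sat_i16_to_i8(int v) {
  return v < -128 ? -128 : v > 127 ? 127 : (signed char)v;
}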
__mmask16 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_packuswb128_mask((__v8hi)__A, (__v8hi)__B, (__v16qi)__W, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A) { +__funline __m256i _mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A) { return (__m256i)__builtin_ia32_pabsb256_mask((__v32qi)__A, (__v32qi)__W, (__mmask32)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_abs_epi8(__mmask32 __U, __m256i __A) { +__funline __m256i _mm256_maskz_abs_epi8(__mmask32 __U, __m256i __A) { return (__m256i)__builtin_ia32_pabsb256_mask( (__v32qi)__A, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) { +__funline __m128i _mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) { return (__m128i)__builtin_ia32_pabsb128_mask((__v16qi)__A, (__v16qi)__W, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_abs_epi8(__mmask16 __U, __m128i __A) { +__funline __m128i _mm_maskz_abs_epi8(__mmask16 __U, __m128i __A) { return (__m128i)__builtin_ia32_pabsb128_mask( (__v16qi)__A, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A) { +__funline __m256i _mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A) { return (__m256i)__builtin_ia32_pabsw256_mask((__v16hi)__A, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) { +__funline __m256i _mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) { return (__m256i)__builtin_ia32_pabsw256_mask( (__v16hi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pabsw128_mask((__v8hi)__A, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_abs_epi16(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_abs_epi16(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pabsw128_mask( (__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpneq_epu8_mask(__m256i __X, __m256i __Y) { +__funline __mmask32 _mm256_cmpneq_epu8_mask(__m256i __X, __m256i __Y) { return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 4, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmplt_epu8_mask(__m256i __X, __m256i __Y) { +__funline __mmask32 _mm256_cmplt_epu8_mask(__m256i __X, __m256i __Y) { return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 1, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpge_epu8_mask(__m256i __X, __m256i 
__Y) { +__funline __mmask32 _mm256_cmpge_epu8_mask(__m256i __X, __m256i __Y) { return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 5, (__mmask32)-1); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmple_epu8_mask(__m256i __X, __m256i __Y) { +__funline __mmask32 _mm256_cmple_epu8_mask(__m256i __X, __m256i __Y) { return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 2, (__mmask32)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpneq_epu16_mask(__m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_cmpneq_epu16_mask(__m256i __X, __m256i __Y) { return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 4, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmplt_epu16_mask(__m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_cmplt_epu16_mask(__m256i __X, __m256i __Y) { return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 1, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpge_epu16_mask(__m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_cmpge_epu16_mask(__m256i __X, __m256i __Y) { return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 5, (__mmask16)-1); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmple_epu16_mask(__m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_cmple_epu16_mask(__m256i __X, __m256i __Y) { return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 2, (__mmask16)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_storeu_epi16(void *__P, __mmask16 __U, __m256i __A) { +__funline void _mm256_mask_storeu_epi16(void *__P, __mmask16 __U, __m256i __A) { __builtin_ia32_storedquhi256_mask((short *)__P, (__v16hi)__A, (__mmask16)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_storeu_epi16(void *__P, __mmask8 __U, __m128i __A) { +__funline void _mm_mask_storeu_epi16(void *__P, __mmask8 __U, __m128i __A) { __builtin_ia32_storedquhi128_mask((short *)__P, (__v8hi)__A, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_paddsw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psubsb128_mask((__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psubsb128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline __m128i - 
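/*
 * [Editorial aside.] The third argument of __builtin_ia32_ucmp*_mask is an
 * immediate predicate. The enum names below are invented for readability;
 * the values are inferred from which intrinsic passes which constant in
 * these hunks (cmpeq 0, cmplt 1, cmple 2, cmpneq 4, cmpge 5, cmpgt 6):
 */
enum ucmp_predicate {
  UCMP_EQ = 0, UCMP_LT = 1, UCMP_LE = 2, /* 3: always false */
  UCMP_NE = 4, UCMP_GE = 5, UCMP_GT = 6  /* 7: always true  */
};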
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psubsw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psubsw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psubusb128_mask((__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psubusb128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psubusw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psubusw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, + __m128i __B) { return (__m256i)__builtin_ia32_psrlw256_mask((__v16hi)__A, (__v8hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, + __m128i __B) { return (__m256i)__builtin_ia32_psrlw256_mask((__v16hi)__A, (__v8hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psrlw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_srl_epi16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_srl_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return 
(__m128i)__builtin_ia32_psrlw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, + __m128i __B) { return (__m256i)__builtin_ia32_psraw256_mask((__v16hi)__A, (__v8hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, + __m128i __B) { return (__m256i)__builtin_ia32_psraw256_mask((__v16hi)__A, (__v8hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psraw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psraw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_paddsw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_paddusb128_mask((__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_paddusb128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_paddusw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_paddusw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sub_epi8(__m128i __W, 
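/*
 * [Editorial aside.] psraw/psrlw shift every word by one scalar count
 * taken from __B. Arithmetic shifts replicate the sign bit, so counts
 * past the element width behave like a shift by 15; logical shifts flush
 * to zero. Sketch, assuming >> on a negative value is an arithmetic shift
 * (true for GCC on x86, though implementation-defined in ISO C):
 */
#include <stdint.h>
static int16_t sra_epi16_lane(int16_t a, unsigned n) {
  return (int16_t)(a >> (n > 15 ? 15 : n)); /* sign-fill saturates count */
}
static uint16_t srl_epi16_lane(uint16_t a, unsigned n) {
  return n > 15 ? 0 : (uint16_t)(a >> n);   /* zero-fill flushes */
}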
__mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psubb128_mask((__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psubb128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psubw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psubw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_paddsb128_mask((__v16qi)__A, (__v16qi)__B, (__v16qi)__W, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_paddsb128_mask( (__v16qi)__A, (__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepi16_epi8(__m128i __A) { +__funline __m128i _mm_cvtepi16_epi8(__m128i __A) { return (__m128i)__builtin_ia32_pmovwb128_mask( (__v8hi)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi16_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtepi16_storeu_epi8(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovwb128mem_mask((__v8qi *)__P, (__v8hi)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi16_epi8(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtepi16_epi8(__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovwb128_mask((__v8hi)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi16_epi8(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtepi16_epi8(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovwb128_mask( (__v8hi)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_srav_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_srav_epi16(__m256i __A, __m256i __B) { return 
(__m256i)__builtin_ia32_psrav16hi_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)-1); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psrav16hi_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psrav16hi_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_srav_epi16(__m128i __A, __m128i __B) { +__funline __m128i _mm_srav_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psrav8hi_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psrav8hi_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psrav8hi_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_srlv_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_srlv_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_psrlv16hi_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)-1); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psrlv16hi_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psrlv16hi_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_srlv_epi16(__m128i __A, __m128i __B) { +__funline __m128i _mm_srlv_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psrlv8hi_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_srlv_epi16(__m128i 
__W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psrlv8hi_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psrlv8hi_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sllv_epi16(__m256i __A, __m256i __B) { +__funline __m256i _mm256_sllv_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_psllv16hi_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)-1); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psllv16hi_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psllv16hi_mask((__v16hi)__A, (__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sllv_epi16(__m128i __A, __m128i __B) { +__funline __m128i _mm_sllv_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psllv8hi_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psllv8hi_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psllv8hi_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psllw128_mask((__v8hi)__A, (__v8hi)__B, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sll_epi16(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_sll_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psllw128_mask( (__v8hi)__A, (__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { 
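/*
 * [Editorial aside.] The sllv/srlv/srav family shifts each word by its
 * own per-lane count rather than one scalar count; out-of-range counts
 * zero the lane (or sign-fill it for srav). Reference loop for srlv:
 */
#include <stdint.h>
static void srlv_epi16_ref(uint16_t r[8], const uint16_t a[8],
                           const uint16_t n[8]) {
  for (int i = 0; i < 8; ++i)
    r[i] = n[i] > 15 ? 0 : (uint16_t)(a[i] >> n[i]); /* per-lane count */
}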
+__funline __m256i _mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, + __m128i __B) { return (__m256i)__builtin_ia32_psllw256_mask((__v16hi)__A, (__v8hi)__B, (__v16hi)__W, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, + __m128i __B) { return (__m256i)__builtin_ia32_psllw256_mask((__v16hi)__A, (__v8hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_packusdw256_mask( (__v8si)__A, (__v8si)__B, (__v16hi)_mm256_setzero_si256(), __M); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, + __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_packusdw256_mask((__v8si)__A, (__v8si)__B, (__v16hi)__W, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_packusdw128_mask( (__v4si)__A, (__v4si)__B, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_packusdw128_mask((__v4si)__A, (__v4si)__B, (__v8hi)__W, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_packssdw256_mask( (__v8si)__A, (__v8si)__B, (__v16hi)_mm256_setzero_si256(), __M); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_packssdw256_mask((__v8si)__A, (__v8si)__B, (__v16hi)__W, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_packssdw128_mask( (__v4si)__A, (__v4si)__B, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_packssdw128_mask((__v4si)__A, (__v4si)__B, (__v8hi)__W, __M); } -extern __inline __mmask16 - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpneq_epu8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { +__funline __mmask16 _mm_mask_cmpneq_epu8_mask(__mmask16 __M, __m128i __X, + __m128i __Y) { return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 4, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmplt_epu8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { +__funline __mmask16 _mm_mask_cmplt_epu8_mask(__mmask16 __M, __m128i __X, + __m128i __Y) { return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 1, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpge_epu8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { +__funline __mmask16 _mm_mask_cmpge_epu8_mask(__mmask16 __M, __m128i __X, + __m128i __Y) { return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 5, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmple_epu8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { +__funline __mmask16 _mm_mask_cmple_epu8_mask(__mmask16 __M, __m128i __X, + __m128i __Y) { return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__X, (__v16qi)__Y, 2, (__mmask16)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpneq_epu16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmpneq_epu16_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 4, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmplt_epu16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmplt_epu16_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 1, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpge_epu16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmpge_epu16_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 5, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmple_epu16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmple_epu16_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__X, (__v8hi)__Y, 2, (__mmask8)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpneq_epi8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { +__funline __mmask16 _mm_mask_cmpneq_epi8_mask(__mmask16 __M, __m128i __X, + __m128i __Y) { return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 4, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmplt_epi8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { +__funline __mmask16 _mm_mask_cmplt_epi8_mask(__mmask16 __M, __m128i __X, + __m128i __Y) { return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 1, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) - _mm_mask_cmpge_epi8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { +__funline __mmask16 _mm_mask_cmpge_epi8_mask(__mmask16 __M, __m128i __X, + __m128i __Y) { return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 5, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmple_epi8_mask(__mmask16 __M, __m128i __X, __m128i __Y) { +__funline __mmask16 _mm_mask_cmple_epi8_mask(__mmask16 __M, __m128i __X, + __m128i __Y) { return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__X, (__v16qi)__Y, 2, (__mmask16)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpneq_epi16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmpneq_epi16_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 4, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmplt_epi16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmplt_epi16_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 1, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpge_epi16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmpge_epi16_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 5, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmple_epi16_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmple_epi16_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__X, (__v8hi)__Y, 2, (__mmask8)__M); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpneq_epu8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { +__funline __mmask32 _mm256_mask_cmpneq_epu8_mask(__mmask32 __M, __m256i __X, + __m256i __Y) { return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 4, (__mmask32)__M); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmplt_epu8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { +__funline __mmask32 _mm256_mask_cmplt_epu8_mask(__mmask32 __M, __m256i __X, + __m256i __Y) { return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 1, (__mmask32)__M); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpge_epu8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { +__funline __mmask32 _mm256_mask_cmpge_epu8_mask(__mmask32 __M, __m256i __X, + __m256i __Y) { return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 5, (__mmask32)__M); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmple_epu8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { +__funline __mmask32 _mm256_mask_cmple_epu8_mask(__mmask32 __M, __m256i __X, + __m256i __Y) { return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__X, (__v32qi)__Y, 2, (__mmask32)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) - _mm256_mask_cmpneq_epu16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_mask_cmpneq_epu16_mask(__mmask16 __M, __m256i __X, + __m256i __Y) { return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 4, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmplt_epu16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_mask_cmplt_epu16_mask(__mmask16 __M, __m256i __X, + __m256i __Y) { return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 1, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpge_epu16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_mask_cmpge_epu16_mask(__mmask16 __M, __m256i __X, + __m256i __Y) { return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 5, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmple_epu16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_mask_cmple_epu16_mask(__mmask16 __M, __m256i __X, + __m256i __Y) { return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__X, (__v16hi)__Y, 2, (__mmask16)__M); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpneq_epi8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { +__funline __mmask32 _mm256_mask_cmpneq_epi8_mask(__mmask32 __M, __m256i __X, + __m256i __Y) { return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 4, (__mmask32)__M); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmplt_epi8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { +__funline __mmask32 _mm256_mask_cmplt_epi8_mask(__mmask32 __M, __m256i __X, + __m256i __Y) { return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 1, (__mmask32)__M); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpge_epi8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { +__funline __mmask32 _mm256_mask_cmpge_epi8_mask(__mmask32 __M, __m256i __X, + __m256i __Y) { return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 5, (__mmask32)__M); } -extern __inline __mmask32 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmple_epi8_mask(__mmask32 __M, __m256i __X, __m256i __Y) { +__funline __mmask32 _mm256_mask_cmple_epi8_mask(__mmask32 __M, __m256i __X, + __m256i __Y) { return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__X, (__v32qi)__Y, 2, (__mmask32)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpneq_epi16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_mask_cmpneq_epi16_mask(__mmask16 __M, __m256i __X, + __m256i __Y) { return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 4, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmplt_epi16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_mask_cmplt_epi16_mask(__mmask16 __M, __m256i __X, + __m256i __Y) { return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 1, (__mmask16)__M); } 
-extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpge_epi16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_mask_cmpge_epi16_mask(__mmask16 __M, __m256i __X, + __m256i __Y) { return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 5, (__mmask16)__M); } -extern __inline __mmask16 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmple_epi16_mask(__mmask16 __M, __m256i __X, __m256i __Y) { +__funline __mmask16 _mm256_mask_cmple_epi16_mask(__mmask16 __M, __m256i __X, + __m256i __Y) { return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__X, (__v16hi)__Y, 2, (__mmask16)__M); } diff --git a/third_party/intel/avx512vldqintrin.internal.h b/third_party/intel/avx512vldqintrin.internal.h index 89fe84c15..c3ac74c67 100644 --- a/third_party/intel/avx512vldqintrin.internal.h +++ b/third_party/intel/avx512vldqintrin.internal.h @@ -11,1242 +11,935 @@ #define __DISABLE_AVX512VLDQ__ #endif /* __AVX512VLDQ__ */ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvttpd_epi64(__m256d __A) { +__funline __m256i _mm256_cvttpd_epi64(__m256d __A) { return (__m256i)__builtin_ia32_cvttpd2qq256_mask( (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvttpd_epi64(__m256i __W, __mmask8 __U, __m256d __A) { +__funline __m256i _mm256_mask_cvttpd_epi64(__m256i __W, __mmask8 __U, + __m256d __A) { return (__m256i)__builtin_ia32_cvttpd2qq256_mask((__v4df)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvttpd_epi64(__mmask8 __U, __m256d __A) { +__funline __m256i _mm256_maskz_cvttpd_epi64(__mmask8 __U, __m256d __A) { return (__m256i)__builtin_ia32_cvttpd2qq256_mask( (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvttpd_epi64(__m128d __A) { +__funline __m128i _mm_cvttpd_epi64(__m128d __A) { return (__m128i)__builtin_ia32_cvttpd2qq128_mask( (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvttpd_epi64(__m128i __W, __mmask8 __U, __m128d __A) { +__funline __m128i _mm_mask_cvttpd_epi64(__m128i __W, __mmask8 __U, __m128d __A) { return (__m128i)__builtin_ia32_cvttpd2qq128_mask((__v2df)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvttpd_epi64(__mmask8 __U, __m128d __A) { +__funline __m128i _mm_maskz_cvttpd_epi64(__mmask8 __U, __m128d __A) { return (__m128i)__builtin_ia32_cvttpd2qq128_mask( (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvttpd_epu64(__m256d __A) { +__funline __m256i _mm256_cvttpd_epu64(__m256d __A) { return (__m256i)__builtin_ia32_cvttpd2uqq256_mask( (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvttpd_epu64(__m256i __W, __mmask8 __U, __m256d __A) { +__funline __m256i _mm256_mask_cvttpd_epu64(__m256i __W, __mmask8 __U, + __m256d __A) { return 
(__m256i)__builtin_ia32_cvttpd2uqq256_mask((__v4df)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvttpd_epu64(__mmask8 __U, __m256d __A) { +__funline __m256i _mm256_maskz_cvttpd_epu64(__mmask8 __U, __m256d __A) { return (__m256i)__builtin_ia32_cvttpd2uqq256_mask( (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvttpd_epu64(__m128d __A) { +__funline __m128i _mm_cvttpd_epu64(__m128d __A) { return (__m128i)__builtin_ia32_cvttpd2uqq128_mask( (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvttpd_epu64(__m128i __W, __mmask8 __U, __m128d __A) { +__funline __m128i _mm_mask_cvttpd_epu64(__m128i __W, __mmask8 __U, __m128d __A) { return (__m128i)__builtin_ia32_cvttpd2uqq128_mask((__v2df)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvttpd_epu64(__mmask8 __U, __m128d __A) { +__funline __m128i _mm_maskz_cvttpd_epu64(__mmask8 __U, __m128d __A) { return (__m128i)__builtin_ia32_cvttpd2uqq128_mask( (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtpd_epi64(__m256d __A) { +__funline __m256i _mm256_cvtpd_epi64(__m256d __A) { return (__m256i)__builtin_ia32_cvtpd2qq256_mask( (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtpd_epi64(__m256i __W, __mmask8 __U, __m256d __A) { +__funline __m256i _mm256_mask_cvtpd_epi64(__m256i __W, __mmask8 __U, + __m256d __A) { return (__m256i)__builtin_ia32_cvtpd2qq256_mask((__v4df)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtpd_epi64(__mmask8 __U, __m256d __A) { +__funline __m256i _mm256_maskz_cvtpd_epi64(__mmask8 __U, __m256d __A) { return (__m256i)__builtin_ia32_cvtpd2qq256_mask( (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtpd_epi64(__m128d __A) { +__funline __m128i _mm_cvtpd_epi64(__m128d __A) { return (__m128i)__builtin_ia32_cvtpd2qq128_mask( (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtpd_epi64(__m128i __W, __mmask8 __U, __m128d __A) { +__funline __m128i _mm_mask_cvtpd_epi64(__m128i __W, __mmask8 __U, __m128d __A) { return (__m128i)__builtin_ia32_cvtpd2qq128_mask((__v2df)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtpd_epi64(__mmask8 __U, __m128d __A) { +__funline __m128i _mm_maskz_cvtpd_epi64(__mmask8 __U, __m128d __A) { return (__m128i)__builtin_ia32_cvtpd2qq128_mask( (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtpd_epu64(__m256d __A) { +__funline __m256i _mm256_cvtpd_epu64(__m256d __A) { return (__m256i)__builtin_ia32_cvtpd2uqq256_mask( (__v4df)__A, 
(__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtpd_epu64(__m256i __W, __mmask8 __U, __m256d __A) { +__funline __m256i _mm256_mask_cvtpd_epu64(__m256i __W, __mmask8 __U, + __m256d __A) { return (__m256i)__builtin_ia32_cvtpd2uqq256_mask((__v4df)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtpd_epu64(__mmask8 __U, __m256d __A) { +__funline __m256i _mm256_maskz_cvtpd_epu64(__mmask8 __U, __m256d __A) { return (__m256i)__builtin_ia32_cvtpd2uqq256_mask( (__v4df)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtpd_epu64(__m128d __A) { +__funline __m128i _mm_cvtpd_epu64(__m128d __A) { return (__m128i)__builtin_ia32_cvtpd2uqq128_mask( (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtpd_epu64(__m128i __W, __mmask8 __U, __m128d __A) { +__funline __m128i _mm_mask_cvtpd_epu64(__m128i __W, __mmask8 __U, __m128d __A) { return (__m128i)__builtin_ia32_cvtpd2uqq128_mask((__v2df)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtpd_epu64(__mmask8 __U, __m128d __A) { +__funline __m128i _mm_maskz_cvtpd_epu64(__mmask8 __U, __m128d __A) { return (__m128i)__builtin_ia32_cvtpd2uqq128_mask( (__v2df)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvttps_epi64(__m128 __A) { +__funline __m256i _mm256_cvttps_epi64(__m128 __A) { return (__m256i)__builtin_ia32_cvttps2qq256_mask( (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvttps_epi64(__m256i __W, __mmask8 __U, __m128 __A) { +__funline __m256i _mm256_mask_cvttps_epi64(__m256i __W, __mmask8 __U, + __m128 __A) { return (__m256i)__builtin_ia32_cvttps2qq256_mask((__v4sf)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvttps_epi64(__mmask8 __U, __m128 __A) { +__funline __m256i _mm256_maskz_cvttps_epi64(__mmask8 __U, __m128 __A) { return (__m256i)__builtin_ia32_cvttps2qq256_mask( (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvttps_epi64(__m128 __A) { +__funline __m128i _mm_cvttps_epi64(__m128 __A) { return (__m128i)__builtin_ia32_cvttps2qq128_mask( (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvttps_epi64(__m128i __W, __mmask8 __U, __m128 __A) { +__funline __m128i _mm_mask_cvttps_epi64(__m128i __W, __mmask8 __U, __m128 __A) { return (__m128i)__builtin_ia32_cvttps2qq128_mask((__v4sf)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvttps_epi64(__mmask8 __U, __m128 __A) { +__funline __m128i _mm_maskz_cvttps_epi64(__mmask8 __U, __m128 __A) { return (__m128i)__builtin_ia32_cvttps2qq128_mask( (__v4sf)__A, 
(__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvttps_epu64(__m128 __A) { +__funline __m256i _mm256_cvttps_epu64(__m128 __A) { return (__m256i)__builtin_ia32_cvttps2uqq256_mask( (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvttps_epu64(__m256i __W, __mmask8 __U, __m128 __A) { +__funline __m256i _mm256_mask_cvttps_epu64(__m256i __W, __mmask8 __U, + __m128 __A) { return (__m256i)__builtin_ia32_cvttps2uqq256_mask((__v4sf)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvttps_epu64(__mmask8 __U, __m128 __A) { +__funline __m256i _mm256_maskz_cvttps_epu64(__mmask8 __U, __m128 __A) { return (__m256i)__builtin_ia32_cvttps2uqq256_mask( (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvttps_epu64(__m128 __A) { +__funline __m128i _mm_cvttps_epu64(__m128 __A) { return (__m128i)__builtin_ia32_cvttps2uqq128_mask( (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvttps_epu64(__m128i __W, __mmask8 __U, __m128 __A) { +__funline __m128i _mm_mask_cvttps_epu64(__m128i __W, __mmask8 __U, __m128 __A) { return (__m128i)__builtin_ia32_cvttps2uqq128_mask((__v4sf)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvttps_epu64(__mmask8 __U, __m128 __A) { +__funline __m128i _mm_maskz_cvttps_epu64(__mmask8 __U, __m128 __A) { return (__m128i)__builtin_ia32_cvttps2uqq128_mask( (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcast_f64x2(__m128d __A) { +__funline __m256d _mm256_broadcast_f64x2(__m128d __A) { return (__m256d)__builtin_ia32_broadcastf64x2_256_mask( (__v2df)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) { +__funline __m256d _mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, + __m128d __A) { return (__m256d)__builtin_ia32_broadcastf64x2_256_mask((__v2df)__A, (__v4df)__O, __M); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) { +__funline __m256d _mm256_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_broadcastf64x2_256_mask( (__v2df)__A, (__v4df)_mm256_setzero_ps(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcast_i64x2(__m128i __A) { +__funline __m256i _mm256_broadcast_i64x2(__m128i __A) { return (__m256i)__builtin_ia32_broadcasti64x2_256_mask( (__v2di)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) { +__funline __m256i _mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, + __m128i __A) { return 
(__m256i)__builtin_ia32_broadcasti64x2_256_mask((__v2di)__A, (__v4di)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { +__funline __m256i _mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_broadcasti64x2_256_mask( (__v2di)__A, (__v4di)_mm256_setzero_si256(), __M); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcast_f32x2(__m128 __A) { +__funline __m256 _mm256_broadcast_f32x2(__m128 __A) { return (__m256)__builtin_ia32_broadcastf32x2_256_mask( (__v4sf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A) { +__funline __m256 _mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, + __m128 __A) { return (__m256)__builtin_ia32_broadcastf32x2_256_mask((__v4sf)__A, (__v8sf)__O, __M); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) { +__funline __m256 _mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_broadcastf32x2_256_mask( (__v4sf)__A, (__v8sf)_mm256_setzero_ps(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcast_i32x2(__m128i __A) { +__funline __m256i _mm256_broadcast_i32x2(__m128i __A) { return (__m256i)__builtin_ia32_broadcasti32x2_256_mask( (__v4si)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, __m128i __A) { +__funline __m256i _mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, + __m128i __A) { return (__m256i)__builtin_ia32_broadcasti32x2_256_mask((__v4si)__A, (__v8si)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { +__funline __m256i _mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_broadcasti32x2_256_mask( (__v4si)__A, (__v8si)_mm256_setzero_si256(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_broadcast_i32x2(__m128i __A) { +__funline __m128i _mm_broadcast_i32x2(__m128i __A) { return (__m128i)__builtin_ia32_broadcasti32x2_128_mask( (__v4si)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_broadcasti32x2_128_mask((__v4si)__A, (__v4si)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_broadcasti32x2_128_mask( (__v4si)__A, (__v4si)_mm_setzero_si128(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mullo_epi64(__m256i __A, __m256i __B) { +__funline __m256i _mm256_mullo_epi64(__m256i 
__A, __m256i __B) { return (__m256i)((__v4du)__A * (__v4du)__B); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmullq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmullq256_mask( (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mullo_epi64(__m128i __A, __m128i __B) { +__funline __m128i _mm_mullo_epi64(__m128i __A, __m128i __B) { return (__m128i)((__v2du)__A * (__v2du)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pmullq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pmullq128_mask( (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { +__funline __m256d _mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) { return (__m256d)__builtin_ia32_andnpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) { +__funline __m256d _mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_andnpd256_mask( (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_andnpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_andnpd128_mask( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) { return 
(__m256)__builtin_ia32_andnps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_andnps256_mask( (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_andnps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_andnps128_mask( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtps_epi64(__m128 __A) { +__funline __m256i _mm256_cvtps_epi64(__m128 __A) { return (__m256i)__builtin_ia32_cvtps2qq256_mask( (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtps_epi64(__m256i __W, __mmask8 __U, __m128 __A) { +__funline __m256i _mm256_mask_cvtps_epi64(__m256i __W, __mmask8 __U, __m128 __A) { return (__m256i)__builtin_ia32_cvtps2qq256_mask((__v4sf)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtps_epi64(__mmask8 __U, __m128 __A) { +__funline __m256i _mm256_maskz_cvtps_epi64(__mmask8 __U, __m128 __A) { return (__m256i)__builtin_ia32_cvtps2qq256_mask( (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtps_epi64(__m128 __A) { +__funline __m128i _mm_cvtps_epi64(__m128 __A) { return (__m128i)__builtin_ia32_cvtps2qq128_mask( (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtps_epi64(__m128i __W, __mmask8 __U, __m128 __A) { +__funline __m128i _mm_mask_cvtps_epi64(__m128i __W, __mmask8 __U, __m128 __A) { return (__m128i)__builtin_ia32_cvtps2qq128_mask((__v4sf)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtps_epi64(__mmask8 __U, __m128 __A) { +__funline __m128i _mm_maskz_cvtps_epi64(__mmask8 __U, __m128 __A) { return (__m128i)__builtin_ia32_cvtps2qq128_mask( (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtps_epu64(__m128 __A) { +__funline __m256i _mm256_cvtps_epu64(__m128 __A) { return (__m256i)__builtin_ia32_cvtps2uqq256_mask( (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtps_epu64(__m256i __W, __mmask8 __U, __m128 __A) { +__funline __m256i 
_mm256_mask_cvtps_epu64(__m256i __W, __mmask8 __U, __m128 __A) { return (__m256i)__builtin_ia32_cvtps2uqq256_mask((__v4sf)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtps_epu64(__mmask8 __U, __m128 __A) { +__funline __m256i _mm256_maskz_cvtps_epu64(__mmask8 __U, __m128 __A) { return (__m256i)__builtin_ia32_cvtps2uqq256_mask( (__v4sf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtps_epu64(__m128 __A) { +__funline __m128i _mm_cvtps_epu64(__m128 __A) { return (__m128i)__builtin_ia32_cvtps2uqq128_mask( (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtps_epu64(__m128i __W, __mmask8 __U, __m128 __A) { +__funline __m128i _mm_mask_cvtps_epu64(__m128i __W, __mmask8 __U, __m128 __A) { return (__m128i)__builtin_ia32_cvtps2uqq128_mask((__v4sf)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtps_epu64(__mmask8 __U, __m128 __A) { +__funline __m128i _mm_maskz_cvtps_epu64(__mmask8 __U, __m128 __A) { return (__m128i)__builtin_ia32_cvtps2uqq128_mask( (__v4sf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi64_ps(__m256i __A) { +__funline __m128 _mm256_cvtepi64_ps(__m256i __A) { return (__m128)__builtin_ia32_cvtqq2ps256_mask( (__v4di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi64_ps(__m128 __W, __mmask8 __U, __m256i __A) { +__funline __m128 _mm256_mask_cvtepi64_ps(__m128 __W, __mmask8 __U, __m256i __A) { return (__m128)__builtin_ia32_cvtqq2ps256_mask((__v4di)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi64_ps(__mmask8 __U, __m256i __A) { +__funline __m128 _mm256_maskz_cvtepi64_ps(__mmask8 __U, __m256i __A) { return (__m128)__builtin_ia32_cvtqq2ps256_mask( (__v4di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepi64_ps(__m128i __A) { +__funline __m128 _mm_cvtepi64_ps(__m128i __A) { return (__m128)__builtin_ia32_cvtqq2ps128_mask( (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi64_ps(__m128 __W, __mmask8 __U, __m128i __A) { +__funline __m128 _mm_mask_cvtepi64_ps(__m128 __W, __mmask8 __U, __m128i __A) { return (__m128)__builtin_ia32_cvtqq2ps128_mask((__v2di)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi64_ps(__mmask8 __U, __m128i __A) { +__funline __m128 _mm_maskz_cvtepi64_ps(__mmask8 __U, __m128i __A) { return (__m128)__builtin_ia32_cvtqq2ps128_mask( (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepu64_ps(__m256i __A) { +__funline __m128 _mm256_cvtepu64_ps(__m256i __A) { return (__m128)__builtin_ia32_cvtuqq2ps256_mask( (__v4di)__A, 
(__v4sf)_mm_setzero_ps(), (__mmask8)-1); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepu64_ps(__m128 __W, __mmask8 __U, __m256i __A) { +__funline __m128 _mm256_mask_cvtepu64_ps(__m128 __W, __mmask8 __U, __m256i __A) { return (__m128)__builtin_ia32_cvtuqq2ps256_mask((__v4di)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepu64_ps(__mmask8 __U, __m256i __A) { +__funline __m128 _mm256_maskz_cvtepu64_ps(__mmask8 __U, __m256i __A) { return (__m128)__builtin_ia32_cvtuqq2ps256_mask( (__v4di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepu64_ps(__m128i __A) { +__funline __m128 _mm_cvtepu64_ps(__m128i __A) { return (__m128)__builtin_ia32_cvtuqq2ps128_mask( (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepu64_ps(__m128 __W, __mmask8 __U, __m128i __A) { +__funline __m128 _mm_mask_cvtepu64_ps(__m128 __W, __mmask8 __U, __m128i __A) { return (__m128)__builtin_ia32_cvtuqq2ps128_mask((__v2di)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepu64_ps(__mmask8 __U, __m128i __A) { +__funline __m128 _mm_maskz_cvtepu64_ps(__mmask8 __U, __m128i __A) { return (__m128)__builtin_ia32_cvtuqq2ps128_mask( (__v2di)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi64_pd(__m256i __A) { +__funline __m256d _mm256_cvtepi64_pd(__m256i __A) { return (__m256d)__builtin_ia32_cvtqq2pd256_mask( (__v4di)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi64_pd(__m256d __W, __mmask8 __U, __m256i __A) { +__funline __m256d _mm256_mask_cvtepi64_pd(__m256d __W, __mmask8 __U, + __m256i __A) { return (__m256d)__builtin_ia32_cvtqq2pd256_mask((__v4di)__A, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi64_pd(__mmask8 __U, __m256i __A) { +__funline __m256d _mm256_maskz_cvtepi64_pd(__mmask8 __U, __m256i __A) { return (__m256d)__builtin_ia32_cvtqq2pd256_mask( (__v4di)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepi64_pd(__m128i __A) { +__funline __m128d _mm_cvtepi64_pd(__m128i __A) { return (__m128d)__builtin_ia32_cvtqq2pd128_mask( (__v2di)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi64_pd(__m128d __W, __mmask8 __U, __m128i __A) { +__funline __m128d _mm_mask_cvtepi64_pd(__m128d __W, __mmask8 __U, __m128i __A) { return (__m128d)__builtin_ia32_cvtqq2pd128_mask((__v2di)__A, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi64_pd(__mmask8 __U, __m128i __A) { +__funline __m128d _mm_maskz_cvtepi64_pd(__mmask8 __U, __m128i __A) { return (__m128d)__builtin_ia32_cvtqq2pd128_mask( (__v2di)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline 
__m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepu64_pd(__m256i __A) { +__funline __m256d _mm256_cvtepu64_pd(__m256i __A) { return (__m256d)__builtin_ia32_cvtuqq2pd256_mask( (__v4di)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepu64_pd(__m256d __W, __mmask8 __U, __m256i __A) { +__funline __m256d _mm256_mask_cvtepu64_pd(__m256d __W, __mmask8 __U, + __m256i __A) { return (__m256d)__builtin_ia32_cvtuqq2pd256_mask((__v4di)__A, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepu64_pd(__mmask8 __U, __m256i __A) { +__funline __m256d _mm256_maskz_cvtepu64_pd(__mmask8 __U, __m256i __A) { return (__m256d)__builtin_ia32_cvtuqq2pd256_mask( (__v4di)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { +__funline __m256d _mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) { return (__m256d)__builtin_ia32_andpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) { +__funline __m256d _mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_andpd256_mask( (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_andpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_andpd128_mask( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) { return (__m256)__builtin_ia32_andps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_andps256_mask( (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_andps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_andps128_mask( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepu64_pd(__m128i __A) { +__funline __m128d _mm_cvtepu64_pd(__m128i __A) { return (__m128d)__builtin_ia32_cvtuqq2pd128_mask( (__v2di)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepu64_pd(__m128d __W, __mmask8 __U, __m128i __A) { +__funline __m128d _mm_mask_cvtepu64_pd(__m128d __W, __mmask8 __U, __m128i __A) { return (__m128d)__builtin_ia32_cvtuqq2pd128_mask((__v2di)__A, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepu64_pd(__mmask8 __U, __m128i __A) { +__funline __m128d _mm_maskz_cvtepu64_pd(__mmask8 __U, __m128i __A) { return (__m128d)__builtin_ia32_cvtuqq2pd128_mask( (__v2di)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { +__funline __m256d _mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) { return (__m256d)__builtin_ia32_xorpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) { +__funline __m256d _mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_xorpd256_mask( (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_xorpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_xor_pd(__mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_xor_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_xorpd128_mask( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) { return (__m256)__builtin_ia32_xorps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_xorps256_mask( (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_xor_ps(__m128 __W, __mmask8 __U, 
__m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_xorps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_xorps128_mask( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { +__funline __m256d _mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) { return (__m256d)__builtin_ia32_orpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) { +__funline __m256d _mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_orpd256_mask( (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_orpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_orpd128_mask( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) { return (__m256)__builtin_ia32_orps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_orps256_mask( (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_orps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_orps128_mask( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_movm_epi32(__mmask8 __A) { +__funline __m128i _mm_movm_epi32(__mmask8 __A) { return (__m128i)__builtin_ia32_cvtmask2d128(__A); } -extern __inline __m256i - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_movm_epi32(__mmask8 __A) { +__funline __m256i _mm256_movm_epi32(__mmask8 __A) { return (__m256i)__builtin_ia32_cvtmask2d256(__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_movm_epi64(__mmask8 __A) { +__funline __m128i _mm_movm_epi64(__mmask8 __A) { return (__m128i)__builtin_ia32_cvtmask2q128(__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_movm_epi64(__mmask8 __A) { +__funline __m256i _mm256_movm_epi64(__mmask8 __A) { return (__m256i)__builtin_ia32_cvtmask2q256(__A); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_movepi32_mask(__m128i __A) { +__funline __mmask8 _mm_movepi32_mask(__m128i __A) { return (__mmask8)__builtin_ia32_cvtd2mask128((__v4si)__A); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_movepi32_mask(__m256i __A) { +__funline __mmask8 _mm256_movepi32_mask(__m256i __A) { return (__mmask8)__builtin_ia32_cvtd2mask256((__v8si)__A); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_movepi64_mask(__m128i __A) { +__funline __mmask8 _mm_movepi64_mask(__m128i __A) { return (__mmask8)__builtin_ia32_cvtq2mask128((__v2di)__A); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_movepi64_mask(__m256i __A) { +__funline __mmask8 _mm256_movepi64_mask(__m256i __A) { return (__mmask8)__builtin_ia32_cvtq2mask256((__v4di)__A); } #ifdef __OPTIMIZE__ -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_extractf64x2_pd(__m256d __A, const int __imm) { +__funline __m128d _mm256_extractf64x2_pd(__m256d __A, const int __imm) { return (__m128d)__builtin_ia32_extractf64x2_256_mask( (__v4df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)-1); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A, - const int __imm) { +__funline __m128d _mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, + __m256d __A, const int __imm) { return (__m128d)__builtin_ia32_extractf64x2_256_mask( (__v4df)__A, __imm, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A, const int __imm) { +__funline __m128d _mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A, + const int __imm) { return (__m128d)__builtin_ia32_extractf64x2_256_mask( (__v4df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_extracti64x2_epi64(__m256i __A, const int __imm) { +__funline __m128i _mm256_extracti64x2_epi64(__m256i __A, const int __imm) { return (__m128i)__builtin_ia32_extracti64x2_256_mask( (__v4di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __A, - const int __imm) { +__funline __m128i _mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, + __m256i __A, const int __imm) { return (__m128i)__builtin_ia32_extracti64x2_256_mask( (__v4di)__A, __imm, (__v2di)__W, 
(__mmask8)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A, const int __imm) { +__funline __m128i _mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A, + const int __imm) { return (__m128i)__builtin_ia32_extracti64x2_256_mask( (__v4di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_reduce_pd(__m256d __A, int __B) { +__funline __m256d _mm256_reduce_pd(__m256d __A, int __B) { return (__m256d)__builtin_ia32_reducepd256_mask( (__v4df)__A, __B, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_reduce_pd(__m256d __W, __mmask8 __U, __m256d __A, int __B) { +__funline __m256d _mm256_mask_reduce_pd(__m256d __W, __mmask8 __U, __m256d __A, + int __B) { return (__m256d)__builtin_ia32_reducepd256_mask((__v4df)__A, __B, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_reduce_pd(__mmask8 __U, __m256d __A, int __B) { +__funline __m256d _mm256_maskz_reduce_pd(__mmask8 __U, __m256d __A, int __B) { return (__m256d)__builtin_ia32_reducepd256_mask( (__v4df)__A, __B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_reduce_pd(__m128d __A, int __B) { +__funline __m128d _mm_reduce_pd(__m128d __A, int __B) { return (__m128d)__builtin_ia32_reducepd128_mask( (__v2df)__A, __B, (__v2df)_mm_setzero_pd(), (__mmask8)-1); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_reduce_pd(__m128d __W, __mmask8 __U, __m128d __A, int __B) { +__funline __m128d _mm_mask_reduce_pd(__m128d __W, __mmask8 __U, __m128d __A, + int __B) { return (__m128d)__builtin_ia32_reducepd128_mask((__v2df)__A, __B, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_reduce_pd(__mmask8 __U, __m128d __A, int __B) { +__funline __m128d _mm_maskz_reduce_pd(__mmask8 __U, __m128d __A, int __B) { return (__m128d)__builtin_ia32_reducepd128_mask( (__v2df)__A, __B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_reduce_ps(__m256 __A, int __B) { +__funline __m256 _mm256_reduce_ps(__m256 __A, int __B) { return (__m256)__builtin_ia32_reduceps256_mask( (__v8sf)__A, __B, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_reduce_ps(__m256 __W, __mmask8 __U, __m256 __A, int __B) { +__funline __m256 _mm256_mask_reduce_ps(__m256 __W, __mmask8 __U, __m256 __A, + int __B) { return (__m256)__builtin_ia32_reduceps256_mask((__v8sf)__A, __B, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_reduce_ps(__mmask8 __U, __m256 __A, int __B) { +__funline __m256 _mm256_maskz_reduce_ps(__mmask8 __U, __m256 __A, int __B) { return (__m256)__builtin_ia32_reduceps256_mask( (__v8sf)__A, __B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_reduce_ps(__m128 __A, 
int __B) { +__funline __m128 _mm_reduce_ps(__m128 __A, int __B) { return (__m128)__builtin_ia32_reduceps128_mask( (__v4sf)__A, __B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_reduce_ps(__m128 __W, __mmask8 __U, __m128 __A, int __B) { +__funline __m128 _mm_mask_reduce_ps(__m128 __W, __mmask8 __U, __m128 __A, + int __B) { return (__m128)__builtin_ia32_reduceps128_mask((__v4sf)__A, __B, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_reduce_ps(__mmask8 __U, __m128 __A, int __B) { +__funline __m128 _mm_maskz_reduce_ps(__mmask8 __U, __m128 __A, int __B) { return (__m128)__builtin_ia32_reduceps128_mask( (__v4sf)__A, __B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_range_pd(__m256d __A, __m256d __B, int __C) { +__funline __m256d _mm256_range_pd(__m256d __A, __m256d __B, int __C) { return (__m256d)__builtin_ia32_rangepd256_mask( (__v4df)__A, (__v4df)__B, __C, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_range_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B, - int __C) { +__funline __m256d _mm256_mask_range_pd(__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B, int __C) { return (__m256d)__builtin_ia32_rangepd256_mask((__v4df)__A, (__v4df)__B, __C, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_range_pd(__mmask8 __U, __m256d __A, __m256d __B, int __C) { +__funline __m256d _mm256_maskz_range_pd(__mmask8 __U, __m256d __A, __m256d __B, + int __C) { return (__m256d)__builtin_ia32_rangepd256_mask((__v4df)__A, (__v4df)__B, __C, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_range_pd(__m128d __A, __m128d __B, int __C) { +__funline __m128d _mm_range_pd(__m128d __A, __m128d __B, int __C) { return (__m128d)__builtin_ia32_rangepd128_mask( (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)-1); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_range_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, - int __C) { +__funline __m128d _mm_mask_range_pd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __C) { return (__m128d)__builtin_ia32_rangepd128_mask((__v2df)__A, (__v2df)__B, __C, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_range_pd(__mmask8 __U, __m128d __A, __m128d __B, int __C) { +__funline __m128d _mm_maskz_range_pd(__mmask8 __U, __m128d __A, __m128d __B, + int __C) { return (__m128d)__builtin_ia32_rangepd128_mask( (__v2df)__A, (__v2df)__B, __C, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_range_ps(__m256 __A, __m256 __B, int __C) { +__funline __m256 _mm256_range_ps(__m256 __A, __m256 __B, int __C) { return (__m256)__builtin_ia32_rangeps256_mask( (__v8sf)__A, (__v8sf)__B, __C, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm256_mask_range_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B, - int __C) { +__funline __m256 _mm256_mask_range_ps(__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B, int __C) { return (__m256)__builtin_ia32_rangeps256_mask((__v8sf)__A, (__v8sf)__B, __C, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_range_ps(__mmask8 __U, __m256 __A, __m256 __B, int __C) { +__funline __m256 _mm256_maskz_range_ps(__mmask8 __U, __m256 __A, __m256 __B, + int __C) { return (__m256)__builtin_ia32_rangeps256_mask((__v8sf)__A, (__v8sf)__B, __C, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_range_ps(__m128 __A, __m128 __B, int __C) { +__funline __m128 _mm_range_ps(__m128 __A, __m128 __B, int __C) { return (__m128)__builtin_ia32_rangeps128_mask( (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_range_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, int __C) { +__funline __m128 _mm_mask_range_ps(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __C) { return (__m128)__builtin_ia32_rangeps128_mask((__v4sf)__A, (__v4sf)__B, __C, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_range_ps(__mmask8 __U, __m128 __A, __m128 __B, int __C) { +__funline __m128 _mm_maskz_range_ps(__mmask8 __U, __m128 __A, __m128 __B, + int __C) { return (__m128)__builtin_ia32_rangeps128_mask( (__v4sf)__A, (__v4sf)__B, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_fpclass_pd_mask(__mmask8 __U, __m256d __A, const int __imm) { +__funline __mmask8 _mm256_mask_fpclass_pd_mask(__mmask8 __U, __m256d __A, + const int __imm) { return (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)__A, __imm, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_fpclass_pd_mask(__m256d __A, const int __imm) { +__funline __mmask8 _mm256_fpclass_pd_mask(__m256d __A, const int __imm) { return (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)__A, __imm, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_fpclass_ps_mask(__mmask8 __U, __m256 __A, const int __imm) { +__funline __mmask8 _mm256_mask_fpclass_ps_mask(__mmask8 __U, __m256 __A, + const int __imm) { return (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)__A, __imm, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_fpclass_ps_mask(__m256 __A, const int __imm) { +__funline __mmask8 _mm256_fpclass_ps_mask(__m256 __A, const int __imm) { return (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)__A, __imm, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fpclass_pd_mask(__mmask8 __U, __m128d __A, const int __imm) { +__funline __mmask8 _mm_mask_fpclass_pd_mask(__mmask8 __U, __m128d __A, + const int __imm) { return (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)__A, __imm, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fpclass_pd_mask(__m128d __A, const int 
__imm) { +__funline __mmask8 _mm_fpclass_pd_mask(__m128d __A, const int __imm) { return (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)__A, __imm, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fpclass_ps_mask(__mmask8 __U, __m128 __A, const int __imm) { +__funline __mmask8 _mm_mask_fpclass_ps_mask(__mmask8 __U, __m128 __A, + const int __imm) { return (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)__A, __imm, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fpclass_ps_mask(__m128 __A, const int __imm) { +__funline __mmask8 _mm_fpclass_ps_mask(__m128 __A, const int __imm) { return (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)__A, __imm, (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_inserti64x2(__m256i __A, __m128i __B, const int __imm) { +__funline __m256i _mm256_inserti64x2(__m256i __A, __m128i __B, const int __imm) { return (__m256i)__builtin_ia32_inserti64x2_256_mask( (__v4di)__A, (__v2di)__B, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_inserti64x2(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B, - const int __imm) { +__funline __m256i _mm256_mask_inserti64x2(__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B, const int __imm) { return (__m256i)__builtin_ia32_inserti64x2_256_mask( (__v4di)__A, (__v2di)__B, __imm, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_inserti64x2(__mmask8 __U, __m256i __A, __m128i __B, - const int __imm) { +__funline __m256i _mm256_maskz_inserti64x2(__mmask8 __U, __m256i __A, __m128i __B, + const int __imm) { return (__m256i)__builtin_ia32_inserti64x2_256_mask( (__v4di)__A, (__v2di)__B, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_insertf64x2(__m256d __A, __m128d __B, const int __imm) { +__funline __m256d _mm256_insertf64x2(__m256d __A, __m128d __B, const int __imm) { return (__m256d)__builtin_ia32_insertf64x2_256_mask( (__v4df)__A, (__v2df)__B, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_insertf64x2(__m256d __W, __mmask8 __U, __m256d __A, __m128d __B, - const int __imm) { +__funline __m256d _mm256_mask_insertf64x2(__m256d __W, __mmask8 __U, __m256d __A, + __m128d __B, const int __imm) { return (__m256d)__builtin_ia32_insertf64x2_256_mask( (__v4df)__A, (__v2df)__B, __imm, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_insertf64x2(__mmask8 __U, __m256d __A, __m128d __B, - const int __imm) { +__funline __m256d _mm256_maskz_insertf64x2(__mmask8 __U, __m256d __A, __m128d __B, + const int __imm) { return (__m256d)__builtin_ia32_insertf64x2_256_mask( (__v4df)__A, (__v2df)__B, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); diff --git a/third_party/intel/avx512vlintrin.internal.h b/third_party/intel/avx512vlintrin.internal.h index c6b4630ac..534d1fdf8 100644 --- a/third_party/intel/avx512vlintrin.internal.h +++ b/third_party/intel/avx512vlintrin.internal.h @@ -13,4523 +13,3476 @@ typedef unsigned int __mmask32; 
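/*
 * The _mm*_mask_* intrinsics in this file merge their result into __W
 * wherever the corresponding bit of __U is set and keep the old __W
 * lane otherwise; the _mm*_maskz_* forms zero the unselected lanes
 * instead. A minimal usage sketch, assuming an AVX512VL target and
 * illustrative inputs a and b of type __m256d (values hypothetical):
 *
 *   __m256d src = _mm256_set1_pd(-1.0);                // fallback lanes
 *   __m256d m   = _mm256_mask_add_pd(src, 0x5, a, b);  // lanes 0,2 = a+b;
 *                                                      // lanes 1,3 = -1.0
 *   __m256d z   = _mm256_maskz_add_pd(0x5, a, b);      // lanes 0,2 = a+b;
 *                                                      // lanes 1,3 = 0.0
 */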
-extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_mov_pd(__m256d __W, __mmask8 __U, __m256d __A) { +__funline __m256d _mm256_mask_mov_pd(__m256d __W, __mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_movapd256_mask((__v4df)__A, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_mov_pd(__mmask8 __U, __m256d __A) { +__funline __m256d _mm256_maskz_mov_pd(__mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_movapd256_mask( (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) { +__funline __m128d _mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_movapd128_mask((__v2df)__A, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mov_pd(__mmask8 __U, __m128d __A) { +__funline __m128d _mm_maskz_mov_pd(__mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_movapd128_mask( (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_load_pd(__m256d __W, __mmask8 __U, void const *__P) { +__funline __m256d _mm256_mask_load_pd(__m256d __W, __mmask8 __U, + void const *__P) { return (__m256d)__builtin_ia32_loadapd256_mask((__v4df *)__P, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_load_pd(__mmask8 __U, void const *__P) { +__funline __m256d _mm256_maskz_load_pd(__mmask8 __U, void const *__P) { return (__m256d)__builtin_ia32_loadapd256_mask( (__v4df *)__P, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_load_pd(__m128d __W, __mmask8 __U, void const *__P) { +__funline __m128d _mm_mask_load_pd(__m128d __W, __mmask8 __U, void const *__P) { return (__m128d)__builtin_ia32_loadapd128_mask((__v2df *)__P, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_load_pd(__mmask8 __U, void const *__P) { +__funline __m128d _mm_maskz_load_pd(__mmask8 __U, void const *__P) { return (__m128d)__builtin_ia32_loadapd128_mask( (__v2df *)__P, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_store_pd(void *__P, __mmask8 __U, __m256d __A) { +__funline void _mm256_mask_store_pd(void *__P, __mmask8 __U, __m256d __A) { __builtin_ia32_storeapd256_mask((__v4df *)__P, (__v4df)__A, (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_store_pd(void *__P, __mmask8 __U, __m128d __A) { +__funline void _mm_mask_store_pd(void *__P, __mmask8 __U, __m128d __A) { __builtin_ia32_storeapd128_mask((__v2df *)__P, (__v2df)__A, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_mov_ps(__m256 __W, __mmask8 __U, __m256 __A) { +__funline __m256 _mm256_mask_mov_ps(__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_movaps256_mask((__v8sf)__A, (__v8sf)__W, (__mmask8)__U); } -extern 
__inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_mov_ps(__mmask8 __U, __m256 __A) { +__funline __m256 _mm256_maskz_mov_ps(__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_movaps256_mask( (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mov_ps(__m128 __W, __mmask8 __U, __m128 __A) { +__funline __m128 _mm_mask_mov_ps(__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_movaps128_mask((__v4sf)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mov_ps(__mmask8 __U, __m128 __A) { +__funline __m128 _mm_maskz_mov_ps(__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_movaps128_mask( (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_load_ps(__m256 __W, __mmask8 __U, void const *__P) { +__funline __m256 _mm256_mask_load_ps(__m256 __W, __mmask8 __U, void const *__P) { return (__m256)__builtin_ia32_loadaps256_mask((__v8sf *)__P, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_load_ps(__mmask8 __U, void const *__P) { +__funline __m256 _mm256_maskz_load_ps(__mmask8 __U, void const *__P) { return (__m256)__builtin_ia32_loadaps256_mask( (__v8sf *)__P, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_load_ps(__m128 __W, __mmask8 __U, void const *__P) { +__funline __m128 _mm_mask_load_ps(__m128 __W, __mmask8 __U, void const *__P) { return (__m128)__builtin_ia32_loadaps128_mask((__v4sf *)__P, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_load_ps(__mmask8 __U, void const *__P) { +__funline __m128 _mm_maskz_load_ps(__mmask8 __U, void const *__P) { return (__m128)__builtin_ia32_loadaps128_mask( (__v4sf *)__P, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_store_ps(void *__P, __mmask8 __U, __m256 __A) { +__funline void _mm256_mask_store_ps(void *__P, __mmask8 __U, __m256 __A) { __builtin_ia32_storeaps256_mask((__v8sf *)__P, (__v8sf)__A, (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_store_ps(void *__P, __mmask8 __U, __m128 __A) { +__funline void _mm_mask_store_ps(void *__P, __mmask8 __U, __m128 __A) { __builtin_ia32_storeaps128_mask((__v4sf *)__P, (__v4sf)__A, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_mov_epi64(__m256i __W, __mmask8 __U, __m256i __A) { +__funline __m256i _mm256_mask_mov_epi64(__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_movdqa64_256_mask((__v4di)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_mov_epi64(__mmask8 __U, __m256i __A) { +__funline __m256i _mm256_maskz_mov_epi64(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_movdqa64_256_mask( (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mov_epi64(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_mov_epi64(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_movdqa64_128_mask((__v2di)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mov_epi64(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_mov_epi64(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_movdqa64_128_mask( (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_load_epi64(__m256i __W, __mmask8 __U, void const *__P) { +__funline __m256i _mm256_mask_load_epi64(__m256i __W, __mmask8 __U, + void const *__P) { return (__m256i)__builtin_ia32_movdqa64load256_mask( (__v4di *)__P, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_load_epi64(__mmask8 __U, void const *__P) { +__funline __m256i _mm256_maskz_load_epi64(__mmask8 __U, void const *__P) { return (__m256i)__builtin_ia32_movdqa64load256_mask( (__v4di *)__P, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_load_epi64(__m128i __W, __mmask8 __U, void const *__P) { +__funline __m128i _mm_mask_load_epi64(__m128i __W, __mmask8 __U, + void const *__P) { return (__m128i)__builtin_ia32_movdqa64load128_mask( (__v2di *)__P, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_load_epi64(__mmask8 __U, void const *__P) { +__funline __m128i _mm_maskz_load_epi64(__mmask8 __U, void const *__P) { return (__m128i)__builtin_ia32_movdqa64load128_mask( (__v2di *)__P, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_store_epi64(void *__P, __mmask8 __U, __m256i __A) { +__funline void _mm256_mask_store_epi64(void *__P, __mmask8 __U, __m256i __A) { __builtin_ia32_movdqa64store256_mask((__v4di *)__P, (__v4di)__A, (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_store_epi64(void *__P, __mmask8 __U, __m128i __A) { +__funline void _mm_mask_store_epi64(void *__P, __mmask8 __U, __m128i __A) { __builtin_ia32_movdqa64store128_mask((__v2di *)__P, (__v2di)__A, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_mov_epi32(__m256i __W, __mmask8 __U, __m256i __A) { +__funline __m256i _mm256_mask_mov_epi32(__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_movdqa32_256_mask((__v8si)__A, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_mov_epi32(__mmask8 __U, __m256i __A) { +__funline __m256i _mm256_maskz_mov_epi32(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_movdqa32_256_mask( (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mov_epi32(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_mov_epi32(__m128i __W, __mmask8 __U, __m128i __A) { 
return (__m128i)__builtin_ia32_movdqa32_128_mask((__v4si)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mov_epi32(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_mov_epi32(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_movdqa32_128_mask( (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_load_epi32(__m256i __W, __mmask8 __U, void const *__P) { +__funline __m256i _mm256_mask_load_epi32(__m256i __W, __mmask8 __U, + void const *__P) { return (__m256i)__builtin_ia32_movdqa32load256_mask( (__v8si *)__P, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_load_epi32(__mmask8 __U, void const *__P) { +__funline __m256i _mm256_maskz_load_epi32(__mmask8 __U, void const *__P) { return (__m256i)__builtin_ia32_movdqa32load256_mask( (__v8si *)__P, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_load_epi32(__m128i __W, __mmask8 __U, void const *__P) { +__funline __m128i _mm_mask_load_epi32(__m128i __W, __mmask8 __U, + void const *__P) { return (__m128i)__builtin_ia32_movdqa32load128_mask( (__v4si *)__P, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_load_epi32(__mmask8 __U, void const *__P) { +__funline __m128i _mm_maskz_load_epi32(__mmask8 __U, void const *__P) { return (__m128i)__builtin_ia32_movdqa32load128_mask( (__v4si *)__P, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_store_epi32(void *__P, __mmask8 __U, __m256i __A) { +__funline void _mm256_mask_store_epi32(void *__P, __mmask8 __U, __m256i __A) { __builtin_ia32_movdqa32store256_mask((__v8si *)__P, (__v8si)__A, (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_store_epi32(void *__P, __mmask8 __U, __m128i __A) { +__funline void _mm_mask_store_epi32(void *__P, __mmask8 __U, __m128i __A) { __builtin_ia32_movdqa32store128_mask((__v4si *)__P, (__v4si)__A, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_addpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_addpd128_mask( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { +__funline __m256d _mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) { return (__m256d)__builtin_ia32_addpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) { +__funline __m256d _mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_addpd256_mask( (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_addps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_addps128_mask( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) { return (__m256)__builtin_ia32_addps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_addps256_mask( (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_subpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_subpd128_mask( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { +__funline __m256d _mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) { return (__m256d)__builtin_ia32_subpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) { +__funline __m256d _mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_subpd256_mask( (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_subps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_subps128_mask( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) { return (__m256)__builtin_ia32_subps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_subps256_mask( (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_store_epi64(void *__P, __m256i __A) { +__funline void _mm256_store_epi64(void *__P, __m256i __A) { *(__m256i *)__P = __A; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_store_epi64(void *__P, __m128i __A) { +__funline void _mm_store_epi64(void *__P, __m128i __A) { *(__m128i *)__P = __A; } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_loadu_pd(__m256d __W, __mmask8 __U, void const *__P) { +__funline __m256d _mm256_mask_loadu_pd(__m256d __W, __mmask8 __U, + void const *__P) { return (__m256d)__builtin_ia32_loadupd256_mask((const double *)__P, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_loadu_pd(__mmask8 __U, void const *__P) { +__funline __m256d _mm256_maskz_loadu_pd(__mmask8 __U, void const *__P) { return (__m256d)__builtin_ia32_loadupd256_mask( (const double *)__P, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_loadu_pd(__m128d __W, __mmask8 __U, void const *__P) { +__funline __m128d _mm_mask_loadu_pd(__m128d __W, __mmask8 __U, void const *__P) { return (__m128d)__builtin_ia32_loadupd128_mask((const double *)__P, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_loadu_pd(__mmask8 __U, void const *__P) { +__funline __m128d _mm_maskz_loadu_pd(__mmask8 __U, void const *__P) { return (__m128d)__builtin_ia32_loadupd128_mask( (const double *)__P, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_storeu_pd(void *__P, __mmask8 __U, __m256d __A) { +__funline void _mm256_mask_storeu_pd(void *__P, __mmask8 __U, __m256d __A) { __builtin_ia32_storeupd256_mask((double *)__P, (__v4df)__A, (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_storeu_pd(void *__P, __mmask8 __U, __m128d __A) { +__funline void _mm_mask_storeu_pd(void *__P, __mmask8 __U, __m128d __A) { __builtin_ia32_storeupd128_mask((double *)__P, (__v2df)__A, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm256_mask_loadu_ps(__m256 __W, __mmask8 __U, void const *__P) { +__funline __m256 _mm256_mask_loadu_ps(__m256 __W, __mmask8 __U, void const *__P) { return (__m256)__builtin_ia32_loadups256_mask((const float *)__P, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_loadu_ps(__mmask8 __U, void const *__P) { +__funline __m256 _mm256_maskz_loadu_ps(__mmask8 __U, void const *__P) { return (__m256)__builtin_ia32_loadups256_mask( (const float *)__P, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_loadu_ps(__m128 __W, __mmask8 __U, void const *__P) { +__funline __m128 _mm_mask_loadu_ps(__m128 __W, __mmask8 __U, void const *__P) { return (__m128)__builtin_ia32_loadups128_mask((const float *)__P, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_loadu_ps(__mmask8 __U, void const *__P) { +__funline __m128 _mm_maskz_loadu_ps(__mmask8 __U, void const *__P) { return (__m128)__builtin_ia32_loadups128_mask( (const float *)__P, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_storeu_ps(void *__P, __mmask8 __U, __m256 __A) { +__funline void _mm256_mask_storeu_ps(void *__P, __mmask8 __U, __m256 __A) { __builtin_ia32_storeups256_mask((float *)__P, (__v8sf)__A, (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_storeu_ps(void *__P, __mmask8 __U, __m128 __A) { +__funline void _mm_mask_storeu_ps(void *__P, __mmask8 __U, __m128 __A) { __builtin_ia32_storeups128_mask((float *)__P, (__v4sf)__A, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_loadu_epi64(__m256i __W, __mmask8 __U, void const *__P) { +__funline __m256i _mm256_mask_loadu_epi64(__m256i __W, __mmask8 __U, + void const *__P) { return (__m256i)__builtin_ia32_loaddqudi256_mask((const long long *)__P, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_loadu_epi64(__mmask8 __U, void const *__P) { +__funline __m256i _mm256_maskz_loadu_epi64(__mmask8 __U, void const *__P) { return (__m256i)__builtin_ia32_loaddqudi256_mask( (const long long *)__P, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_loadu_epi64(__m128i __W, __mmask8 __U, void const *__P) { +__funline __m128i _mm_mask_loadu_epi64(__m128i __W, __mmask8 __U, + void const *__P) { return (__m128i)__builtin_ia32_loaddqudi128_mask((const long long *)__P, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_loadu_epi64(__mmask8 __U, void const *__P) { +__funline __m128i _mm_maskz_loadu_epi64(__mmask8 __U, void const *__P) { return (__m128i)__builtin_ia32_loaddqudi128_mask( (const long long *)__P, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_storeu_epi64(void *__P, __mmask8 __U, __m256i __A) { +__funline void _mm256_mask_storeu_epi64(void *__P, __mmask8 __U, __m256i __A) { __builtin_ia32_storedqudi256_mask((long 
long *)__P, (__v4di)__A, (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_storeu_epi64(void *__P, __mmask8 __U, __m128i __A) { +__funline void _mm_mask_storeu_epi64(void *__P, __mmask8 __U, __m128i __A) { __builtin_ia32_storedqudi128_mask((long long *)__P, (__v2di)__A, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_loadu_epi32(__m256i __W, __mmask8 __U, void const *__P) { +__funline __m256i _mm256_mask_loadu_epi32(__m256i __W, __mmask8 __U, + void const *__P) { return (__m256i)__builtin_ia32_loaddqusi256_mask((const int *)__P, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_loadu_epi32(__mmask8 __U, void const *__P) { +__funline __m256i _mm256_maskz_loadu_epi32(__mmask8 __U, void const *__P) { return (__m256i)__builtin_ia32_loaddqusi256_mask( (const int *)__P, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_loadu_epi32(__m128i __W, __mmask8 __U, void const *__P) { +__funline __m128i _mm_mask_loadu_epi32(__m128i __W, __mmask8 __U, + void const *__P) { return (__m128i)__builtin_ia32_loaddqusi128_mask((const int *)__P, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_loadu_epi32(__mmask8 __U, void const *__P) { +__funline __m128i _mm_maskz_loadu_epi32(__mmask8 __U, void const *__P) { return (__m128i)__builtin_ia32_loaddqusi128_mask( (const int *)__P, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_storeu_epi32(void *__P, __mmask8 __U, __m256i __A) { +__funline void _mm256_mask_storeu_epi32(void *__P, __mmask8 __U, __m256i __A) { __builtin_ia32_storedqusi256_mask((int *)__P, (__v8si)__A, (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_storeu_epi32(void *__P, __mmask8 __U, __m128i __A) { +__funline void _mm_mask_storeu_epi32(void *__P, __mmask8 __U, __m128i __A) { __builtin_ia32_storedqusi128_mask((int *)__P, (__v4si)__A, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) { +__funline __m256i _mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_pabsd256_mask((__v8si)__A, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) { +__funline __m256i _mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_pabsd256_mask( (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pabsd128_mask((__v4si)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) { +__funline __m128i 
_mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pabsd128_mask( (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_abs_epi64(__m256i __A) { +__funline __m256i _mm256_abs_epi64(__m256i __A) { return (__m256i)__builtin_ia32_pabsq256_mask( (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_abs_epi64(__m256i __W, __mmask8 __U, __m256i __A) { +__funline __m256i _mm256_mask_abs_epi64(__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_pabsq256_mask((__v4di)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_abs_epi64(__mmask8 __U, __m256i __A) { +__funline __m256i _mm256_maskz_abs_epi64(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_pabsq256_mask( (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_abs_epi64(__m128i __A) { +__funline __m128i _mm_abs_epi64(__m128i __A) { return (__m128i)__builtin_ia32_pabsq128_mask( (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_abs_epi64(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_abs_epi64(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pabsq128_mask((__v2di)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_abs_epi64(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_abs_epi64(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pabsq128_mask( (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtpd_epu32(__m256d __A) { +__funline __m128i _mm256_cvtpd_epu32(__m256d __A) { return (__m128i)__builtin_ia32_cvtpd2udq256_mask( (__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtpd_epu32(__m128i __W, __mmask8 __U, __m256d __A) { +__funline __m128i _mm256_mask_cvtpd_epu32(__m128i __W, __mmask8 __U, + __m256d __A) { return (__m128i)__builtin_ia32_cvtpd2udq256_mask((__v4df)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtpd_epu32(__mmask8 __U, __m256d __A) { +__funline __m128i _mm256_maskz_cvtpd_epu32(__mmask8 __U, __m256d __A) { return (__m128i)__builtin_ia32_cvtpd2udq256_mask( (__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtpd_epu32(__m128d __A) { +__funline __m128i _mm_cvtpd_epu32(__m128d __A) { return (__m128i)__builtin_ia32_cvtpd2udq128_mask( (__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtpd_epu32(__m128i __W, __mmask8 __U, __m128d __A) { +__funline __m128i _mm_mask_cvtpd_epu32(__m128i __W, __mmask8 __U, __m128d __A) { return 
(__m128i)__builtin_ia32_cvtpd2udq128_mask((__v2df)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtpd_epu32(__mmask8 __U, __m128d __A) { +__funline __m128i _mm_maskz_cvtpd_epu32(__mmask8 __U, __m128d __A) { return (__m128i)__builtin_ia32_cvtpd2udq128_mask( (__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvttps_epi32(__m256i __W, __mmask8 __U, __m256 __A) { +__funline __m256i _mm256_mask_cvttps_epi32(__m256i __W, __mmask8 __U, + __m256 __A) { return (__m256i)__builtin_ia32_cvttps2dq256_mask((__v8sf)__A, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvttps_epi32(__mmask8 __U, __m256 __A) { +__funline __m256i _mm256_maskz_cvttps_epi32(__mmask8 __U, __m256 __A) { return (__m256i)__builtin_ia32_cvttps2dq256_mask( (__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvttps_epi32(__m128i __W, __mmask8 __U, __m128 __A) { +__funline __m128i _mm_mask_cvttps_epi32(__m128i __W, __mmask8 __U, __m128 __A) { return (__m128i)__builtin_ia32_cvttps2dq128_mask((__v4sf)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvttps_epi32(__mmask8 __U, __m128 __A) { +__funline __m128i _mm_maskz_cvttps_epi32(__mmask8 __U, __m128 __A) { return (__m128i)__builtin_ia32_cvttps2dq128_mask( (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvttps_epu32(__m256 __A) { +__funline __m256i _mm256_cvttps_epu32(__m256 __A) { return (__m256i)__builtin_ia32_cvttps2udq256_mask( (__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvttps_epu32(__m256i __W, __mmask8 __U, __m256 __A) { +__funline __m256i _mm256_mask_cvttps_epu32(__m256i __W, __mmask8 __U, + __m256 __A) { return (__m256i)__builtin_ia32_cvttps2udq256_mask((__v8sf)__A, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvttps_epu32(__mmask8 __U, __m256 __A) { +__funline __m256i _mm256_maskz_cvttps_epu32(__mmask8 __U, __m256 __A) { return (__m256i)__builtin_ia32_cvttps2udq256_mask( (__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvttps_epu32(__m128 __A) { +__funline __m128i _mm_cvttps_epu32(__m128 __A) { return (__m128i)__builtin_ia32_cvttps2udq128_mask( (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvttps_epu32(__m128i __W, __mmask8 __U, __m128 __A) { +__funline __m128i _mm_mask_cvttps_epu32(__m128i __W, __mmask8 __U, __m128 __A) { return (__m128i)__builtin_ia32_cvttps2udq128_mask((__v4sf)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvttps_epu32(__mmask8 __U, __m128 __A) { +__funline __m128i _mm_maskz_cvttps_epu32(__mmask8 __U, 
__m128 __A) { return (__m128i)__builtin_ia32_cvttps2udq128_mask( (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvttpd_epi32(__m128i __W, __mmask8 __U, __m256d __A) { +__funline __m128i _mm256_mask_cvttpd_epi32(__m128i __W, __mmask8 __U, + __m256d __A) { return (__m128i)__builtin_ia32_cvttpd2dq256_mask((__v4df)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvttpd_epi32(__mmask8 __U, __m256d __A) { +__funline __m128i _mm256_maskz_cvttpd_epi32(__mmask8 __U, __m256d __A) { return (__m128i)__builtin_ia32_cvttpd2dq256_mask( (__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvttpd_epi32(__m128i __W, __mmask8 __U, __m128d __A) { +__funline __m128i _mm_mask_cvttpd_epi32(__m128i __W, __mmask8 __U, __m128d __A) { return (__m128i)__builtin_ia32_cvttpd2dq128_mask((__v2df)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvttpd_epi32(__mmask8 __U, __m128d __A) { +__funline __m128i _mm_maskz_cvttpd_epi32(__mmask8 __U, __m128d __A) { return (__m128i)__builtin_ia32_cvttpd2dq128_mask( (__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvttpd_epu32(__m256d __A) { +__funline __m128i _mm256_cvttpd_epu32(__m256d __A) { return (__m128i)__builtin_ia32_cvttpd2udq256_mask( (__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvttpd_epu32(__m128i __W, __mmask8 __U, __m256d __A) { +__funline __m128i _mm256_mask_cvttpd_epu32(__m128i __W, __mmask8 __U, + __m256d __A) { return (__m128i)__builtin_ia32_cvttpd2udq256_mask((__v4df)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvttpd_epu32(__mmask8 __U, __m256d __A) { +__funline __m128i _mm256_maskz_cvttpd_epu32(__mmask8 __U, __m256d __A) { return (__m128i)__builtin_ia32_cvttpd2udq256_mask( (__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvttpd_epu32(__m128d __A) { +__funline __m128i _mm_cvttpd_epu32(__m128d __A) { return (__m128i)__builtin_ia32_cvttpd2udq128_mask( (__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvttpd_epu32(__m128i __W, __mmask8 __U, __m128d __A) { +__funline __m128i _mm_mask_cvttpd_epu32(__m128i __W, __mmask8 __U, __m128d __A) { return (__m128i)__builtin_ia32_cvttpd2udq128_mask((__v2df)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvttpd_epu32(__mmask8 __U, __m128d __A) { +__funline __m128i _mm_maskz_cvttpd_epu32(__mmask8 __U, __m128d __A) { return (__m128i)__builtin_ia32_cvttpd2udq128_mask( (__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtpd_epi32(__m128i __W, __mmask8 __U, 
__m256d __A) { +__funline __m128i _mm256_mask_cvtpd_epi32(__m128i __W, __mmask8 __U, + __m256d __A) { return (__m128i)__builtin_ia32_cvtpd2dq256_mask((__v4df)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtpd_epi32(__mmask8 __U, __m256d __A) { +__funline __m128i _mm256_maskz_cvtpd_epi32(__mmask8 __U, __m256d __A) { return (__m128i)__builtin_ia32_cvtpd2dq256_mask( (__v4df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtpd_epi32(__m128i __W, __mmask8 __U, __m128d __A) { +__funline __m128i _mm_mask_cvtpd_epi32(__m128i __W, __mmask8 __U, __m128d __A) { return (__m128i)__builtin_ia32_cvtpd2dq128_mask((__v2df)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtpd_epi32(__mmask8 __U, __m128d __A) { +__funline __m128i _mm_maskz_cvtpd_epi32(__mmask8 __U, __m128d __A) { return (__m128i)__builtin_ia32_cvtpd2dq128_mask( (__v2df)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi32_pd(__m256d __W, __mmask8 __U, __m128i __A) { +__funline __m256d _mm256_mask_cvtepi32_pd(__m256d __W, __mmask8 __U, + __m128i __A) { return (__m256d)__builtin_ia32_cvtdq2pd256_mask((__v4si)__A, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi32_pd(__mmask8 __U, __m128i __A) { +__funline __m256d _mm256_maskz_cvtepi32_pd(__mmask8 __U, __m128i __A) { return (__m256d)__builtin_ia32_cvtdq2pd256_mask( (__v4si)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi32_pd(__m128d __W, __mmask8 __U, __m128i __A) { +__funline __m128d _mm_mask_cvtepi32_pd(__m128d __W, __mmask8 __U, __m128i __A) { return (__m128d)__builtin_ia32_cvtdq2pd128_mask((__v4si)__A, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi32_pd(__mmask8 __U, __m128i __A) { +__funline __m128d _mm_maskz_cvtepi32_pd(__mmask8 __U, __m128i __A) { return (__m128d)__builtin_ia32_cvtdq2pd128_mask( (__v4si)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepu32_pd(__m128i __A) { +__funline __m256d _mm256_cvtepu32_pd(__m128i __A) { return (__m256d)__builtin_ia32_cvtudq2pd256_mask( (__v4si)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepu32_pd(__m256d __W, __mmask8 __U, __m128i __A) { +__funline __m256d _mm256_mask_cvtepu32_pd(__m256d __W, __mmask8 __U, + __m128i __A) { return (__m256d)__builtin_ia32_cvtudq2pd256_mask((__v4si)__A, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepu32_pd(__mmask8 __U, __m128i __A) { +__funline __m256d _mm256_maskz_cvtepu32_pd(__mmask8 __U, __m128i __A) { return (__m256d)__builtin_ia32_cvtudq2pd256_mask( (__v4si)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) - _mm_cvtepu32_pd(__m128i __A) { +__funline __m128d _mm_cvtepu32_pd(__m128i __A) { return (__m128d)__builtin_ia32_cvtudq2pd128_mask( (__v4si)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepu32_pd(__m128d __W, __mmask8 __U, __m128i __A) { +__funline __m128d _mm_mask_cvtepu32_pd(__m128d __W, __mmask8 __U, __m128i __A) { return (__m128d)__builtin_ia32_cvtudq2pd128_mask((__v4si)__A, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepu32_pd(__mmask8 __U, __m128i __A) { +__funline __m128d _mm_maskz_cvtepu32_pd(__mmask8 __U, __m128i __A) { return (__m128d)__builtin_ia32_cvtudq2pd128_mask( (__v4si)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi32_ps(__m256 __W, __mmask8 __U, __m256i __A) { +__funline __m256 _mm256_mask_cvtepi32_ps(__m256 __W, __mmask8 __U, __m256i __A) { return (__m256)__builtin_ia32_cvtdq2ps256_mask((__v8si)__A, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi32_ps(__mmask8 __U, __m256i __A) { +__funline __m256 _mm256_maskz_cvtepi32_ps(__mmask8 __U, __m256i __A) { return (__m256)__builtin_ia32_cvtdq2ps256_mask( (__v8si)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi32_ps(__m128 __W, __mmask8 __U, __m128i __A) { +__funline __m128 _mm_mask_cvtepi32_ps(__m128 __W, __mmask8 __U, __m128i __A) { return (__m128)__builtin_ia32_cvtdq2ps128_mask((__v4si)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi32_ps(__mmask8 __U, __m128i __A) { +__funline __m128 _mm_maskz_cvtepi32_ps(__mmask8 __U, __m128i __A) { return (__m128)__builtin_ia32_cvtdq2ps128_mask( (__v4si)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepu32_ps(__m256i __A) { +__funline __m256 _mm256_cvtepu32_ps(__m256i __A) { return (__m256)__builtin_ia32_cvtudq2ps256_mask( (__v8si)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepu32_ps(__m256 __W, __mmask8 __U, __m256i __A) { +__funline __m256 _mm256_mask_cvtepu32_ps(__m256 __W, __mmask8 __U, __m256i __A) { return (__m256)__builtin_ia32_cvtudq2ps256_mask((__v8si)__A, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepu32_ps(__mmask8 __U, __m256i __A) { +__funline __m256 _mm256_maskz_cvtepu32_ps(__mmask8 __U, __m256i __A) { return (__m256)__builtin_ia32_cvtudq2ps256_mask( (__v8si)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepu32_ps(__m128i __A) { +__funline __m128 _mm_cvtepu32_ps(__m128i __A) { return (__m128)__builtin_ia32_cvtudq2ps128_mask( (__v4si)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepu32_ps(__m128 
__W, __mmask8 __U, __m128i __A) { +__funline __m128 _mm_mask_cvtepu32_ps(__m128 __W, __mmask8 __U, __m128i __A) { return (__m128)__builtin_ia32_cvtudq2ps128_mask((__v4si)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepu32_ps(__mmask8 __U, __m128i __A) { +__funline __m128 _mm_maskz_cvtepu32_ps(__mmask8 __U, __m128i __A) { return (__m128)__builtin_ia32_cvtudq2ps128_mask( (__v4si)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtps_pd(__m256d __W, __mmask8 __U, __m128 __A) { +__funline __m256d _mm256_mask_cvtps_pd(__m256d __W, __mmask8 __U, __m128 __A) { return (__m256d)__builtin_ia32_cvtps2pd256_mask((__v4sf)__A, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtps_pd(__mmask8 __U, __m128 __A) { +__funline __m256d _mm256_maskz_cvtps_pd(__mmask8 __U, __m128 __A) { return (__m256d)__builtin_ia32_cvtps2pd256_mask( (__v4sf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtps_pd(__m128d __W, __mmask8 __U, __m128 __A) { +__funline __m128d _mm_mask_cvtps_pd(__m128d __W, __mmask8 __U, __m128 __A) { return (__m128d)__builtin_ia32_cvtps2pd128_mask((__v4sf)__A, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtps_pd(__mmask8 __U, __m128 __A) { +__funline __m128d _mm_maskz_cvtps_pd(__mmask8 __U, __m128 __A) { return (__m128d)__builtin_ia32_cvtps2pd128_mask( (__v4sf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepi32_epi8(__m128i __A) { +__funline __m128i _mm_cvtepi32_epi8(__m128i __A) { return (__m128i)__builtin_ia32_pmovdb128_mask( (__v4si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi32_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtepi32_storeu_epi8(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovdb128mem_mask((__v16qi *)__P, (__v4si)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovdb128_mask((__v4si)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi32_epi8(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtepi32_epi8(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovdb128_mask( (__v4si)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi32_epi8(__m256i __A) { +__funline __m128i _mm256_cvtepi32_epi8(__m256i __A) { return (__m128i)__builtin_ia32_pmovdb256_mask( (__v8si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) { +__funline 
__m128i _mm256_mask_cvtepi32_epi8(__m128i __O, __mmask8 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovdb256_mask((__v8si)__A, (__v16qi)__O, __M); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi32_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) { +__funline void _mm256_mask_cvtepi32_storeu_epi8(void *__P, __mmask8 __M, + __m256i __A) { __builtin_ia32_pmovdb256mem_mask((__v16qi *)__P, (__v8si)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi32_epi8(__mmask8 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtepi32_epi8(__mmask8 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovdb256_mask( (__v8si)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsepi32_epi8(__m128i __A) { +__funline __m128i _mm_cvtsepi32_epi8(__m128i __A) { return (__m128i)__builtin_ia32_pmovsdb128_mask( (__v4si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtsepi32_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtsepi32_storeu_epi8(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovsdb128mem_mask((__v16qi *)__P, (__v4si)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtsepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtsepi32_epi8(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pmovsdb128_mask((__v4si)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtsepi32_epi8(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtsepi32_epi8(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovsdb128_mask( (__v4si)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtsepi32_epi8(__m256i __A) { +__funline __m128i _mm256_cvtsepi32_epi8(__m256i __A) { return (__m128i)__builtin_ia32_pmovsdb256_mask( (__v8si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtsepi32_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) { +__funline void _mm256_mask_cvtsepi32_storeu_epi8(void *__P, __mmask8 __M, + __m256i __A) { __builtin_ia32_pmovsdb256mem_mask((__v16qi *)__P, (__v8si)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtsepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtsepi32_epi8(__m128i __O, __mmask8 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovsdb256_mask((__v8si)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtsepi32_epi8(__mmask8 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtsepi32_epi8(__mmask8 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovsdb256_mask( (__v8si)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtusepi32_epi8(__m128i __A) { +__funline __m128i _mm_cvtusepi32_epi8(__m128i 
__A) { return (__m128i)__builtin_ia32_pmovusdb128_mask( (__v4si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtusepi32_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtusepi32_storeu_epi8(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovusdb128mem_mask((__v16qi *)__P, (__v4si)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtusepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtusepi32_epi8(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pmovusdb128_mask((__v4si)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtusepi32_epi8(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtusepi32_epi8(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovusdb128_mask( (__v4si)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtusepi32_epi8(__m256i __A) { +__funline __m128i _mm256_cvtusepi32_epi8(__m256i __A) { return (__m128i)__builtin_ia32_pmovusdb256_mask( (__v8si)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtusepi32_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) { +__funline void _mm256_mask_cvtusepi32_storeu_epi8(void *__P, __mmask8 __M, + __m256i __A) { __builtin_ia32_pmovusdb256mem_mask((__v16qi *)__P, (__v8si)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtusepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtusepi32_epi8(__m128i __O, __mmask8 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovusdb256_mask((__v8si)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtusepi32_epi8(__mmask8 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtusepi32_epi8(__mmask8 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovusdb256_mask( (__v8si)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepi32_epi16(__m128i __A) { +__funline __m128i _mm_cvtepi32_epi16(__m128i __A) { return (__m128i)__builtin_ia32_pmovdw128_mask( (__v4si)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi32_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtepi32_storeu_epi16(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovdw128mem_mask((__v8hi *)__P, (__v4si)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtepi32_epi16(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pmovdw128_mask((__v4si)__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi32_epi16(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtepi32_epi16(__mmask8 
__M, __m128i __A) { return (__m128i)__builtin_ia32_pmovdw128_mask( (__v4si)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi32_epi16(__m256i __A) { +__funline __m128i _mm256_cvtepi32_epi16(__m256i __A) { return (__m128i)__builtin_ia32_pmovdw256_mask( (__v8si)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi32_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) { +__funline void _mm256_mask_cvtepi32_storeu_epi16(void *__P, __mmask8 __M, + __m256i __A) { __builtin_ia32_pmovdw256mem_mask((__v8hi *)__P, (__v8si)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtepi32_epi16(__m128i __O, __mmask8 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovdw256_mask((__v8si)__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi32_epi16(__mmask8 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtepi32_epi16(__mmask8 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovdw256_mask( (__v8si)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsepi32_epi16(__m128i __A) { +__funline __m128i _mm_cvtsepi32_epi16(__m128i __A) { return (__m128i)__builtin_ia32_pmovsdw128_mask( (__v4si)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtsepi32_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtsepi32_storeu_epi16(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovsdw128mem_mask((__v8hi *)__P, (__v4si)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtsepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtsepi32_epi16(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pmovsdw128_mask((__v4si)__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtsepi32_epi16(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtsepi32_epi16(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovsdw128_mask( (__v4si)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtsepi32_epi16(__m256i __A) { +__funline __m128i _mm256_cvtsepi32_epi16(__m256i __A) { return (__m128i)__builtin_ia32_pmovsdw256_mask( (__v8si)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtsepi32_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) { +__funline void _mm256_mask_cvtsepi32_storeu_epi16(void *__P, __mmask8 __M, + __m256i __A) { __builtin_ia32_pmovsdw256mem_mask((__v8hi *)__P, (__v8si)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtsepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtsepi32_epi16(__m128i __O, __mmask8 __M, + __m256i 
__A) { return (__m128i)__builtin_ia32_pmovsdw256_mask((__v8si)__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtsepi32_epi16(__mmask8 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtsepi32_epi16(__mmask8 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovsdw256_mask( (__v8si)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtusepi32_epi16(__m128i __A) { +__funline __m128i _mm_cvtusepi32_epi16(__m128i __A) { return (__m128i)__builtin_ia32_pmovusdw128_mask( (__v4si)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtusepi32_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtusepi32_storeu_epi16(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovusdw128mem_mask((__v8hi *)__P, (__v4si)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtusepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtusepi32_epi16(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pmovusdw128_mask((__v4si)__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtusepi32_epi16(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtusepi32_epi16(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovusdw128_mask( (__v4si)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtusepi32_epi16(__m256i __A) { +__funline __m128i _mm256_cvtusepi32_epi16(__m256i __A) { return (__m128i)__builtin_ia32_pmovusdw256_mask( (__v8si)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtusepi32_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) { +__funline void _mm256_mask_cvtusepi32_storeu_epi16(void *__P, __mmask8 __M, + __m256i __A) { __builtin_ia32_pmovusdw256mem_mask((__v8hi *)__P, (__v8si)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtusepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtusepi32_epi16(__m128i __O, __mmask8 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovusdw256_mask((__v8si)__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtusepi32_epi16(__mmask8 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtusepi32_epi16(__mmask8 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovusdw256_mask( (__v8si)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepi64_epi8(__m128i __A) { +__funline __m128i _mm_cvtepi64_epi8(__m128i __A) { return (__m128i)__builtin_ia32_pmovqb128_mask( (__v2di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, + __m128i __A) { 
__builtin_ia32_pmovqb128mem_mask((__v16qi *)__P, (__v2di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovqb128_mask((__v2di)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi64_epi8(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtepi64_epi8(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovqb128_mask( (__v2di)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi64_epi8(__m256i __A) { +__funline __m128i _mm256_cvtepi64_epi8(__m256i __A) { return (__m128i)__builtin_ia32_pmovqb256_mask( (__v4di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) { +__funline void _mm256_mask_cvtepi64_storeu_epi8(void *__P, __mmask8 __M, + __m256i __A) { __builtin_ia32_pmovqb256mem_mask((__v16qi *)__P, (__v4di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovqb256_mask((__v4di)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi64_epi8(__mmask8 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtepi64_epi8(__mmask8 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovqb256_mask( (__v4di)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsepi64_epi8(__m128i __A) { +__funline __m128i _mm_cvtsepi64_epi8(__m128i __A) { return (__m128i)__builtin_ia32_pmovsqb128_mask( (__v2di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovsqb128mem_mask((__v16qi *)__P, (__v2di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pmovsqb128_mask((__v2di)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtsepi64_epi8(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtsepi64_epi8(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovsqb128_mask( (__v2di)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtsepi64_epi8(__m256i __A) { +__funline __m128i _mm256_cvtsepi64_epi8(__m256i __A) { return (__m128i)__builtin_ia32_pmovsqb256_mask( (__v4di)__A, (__v16qi)_mm_undefined_si128(), 
(__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) { +__funline void _mm256_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, + __m256i __A) { __builtin_ia32_pmovsqb256mem_mask((__v16qi *)__P, (__v4di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovsqb256_mask((__v4di)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtsepi64_epi8(__mmask8 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtsepi64_epi8(__mmask8 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovsqb256_mask( (__v4di)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtusepi64_epi8(__m128i __A) { +__funline __m128i _mm_cvtusepi64_epi8(__m128i __A) { return (__m128i)__builtin_ia32_pmovusqb128_mask( (__v2di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovusqb128mem_mask((__v16qi *)__P, (__v2di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pmovusqb128_mask((__v2di)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtusepi64_epi8(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtusepi64_epi8(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovusqb128_mask( (__v2di)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtusepi64_epi8(__m256i __A) { +__funline __m128i _mm256_cvtusepi64_epi8(__m256i __A) { return (__m128i)__builtin_ia32_pmovusqb256_mask( (__v4di)__A, (__v16qi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, __m256i __A) { +__funline void _mm256_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, + __m256i __A) { __builtin_ia32_pmovusqb256mem_mask((__v16qi *)__P, (__v4di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovusqb256_mask((__v4di)__A, (__v16qi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtusepi64_epi8(__mmask8 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtusepi64_epi8(__mmask8 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovusqb256_mask( 
(__v4di)__A, (__v16qi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepi64_epi16(__m128i __A) { +__funline __m128i _mm_cvtepi64_epi16(__m128i __A) { return (__m128i)__builtin_ia32_pmovqw128_mask( (__v2di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovqw128mem_mask((__v8hi *)__P, (__v2di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pmovqw128_mask((__v2di)__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi64_epi16(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtepi64_epi16(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovqw128_mask( (__v2di)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi64_epi16(__m256i __A) { +__funline __m128i _mm256_cvtepi64_epi16(__m256i __A) { return (__m128i)__builtin_ia32_pmovqw256_mask( (__v4di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) { +__funline void _mm256_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, + __m256i __A) { __builtin_ia32_pmovqw256mem_mask((__v8hi *)__P, (__v4di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovqw256_mask((__v4di)__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi64_epi16(__mmask8 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtepi64_epi16(__mmask8 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovqw256_mask( (__v4di)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsepi64_epi16(__m128i __A) { +__funline __m128i _mm_cvtsepi64_epi16(__m128i __A) { return (__m128i)__builtin_ia32_pmovsqw128_mask( (__v2di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovsqw128mem_mask((__v8hi *)__P, (__v2di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pmovsqw128_mask((__v2di)__A, (__v8hi)__O, __M); } 
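The pmov conversion family above follows one pattern throughout: the plain form converts every lane, the _mask form merges converted lanes into __O under __M, and the _maskz form zeroes the unselected lanes. A minimal sketch of calling the 64-to-16 truncating variants defined in this hunk (function and buffer names are illustrative, not part of the patch; compile with -mavx512f -mavx512vl):

#include <immintrin.h>

/* Narrow four 64-bit lanes to 16-bit lanes. Lanes whose mask bit is
   clear are zeroed by the _maskz form, or taken from the merge
   operand by the _mask form. */
__m128i narrow_zero(__mmask8 keep, __m256i q) {
  return _mm256_maskz_cvtepi64_epi16(keep, q);
}

__m128i narrow_merge(__m128i fallback, __mmask8 keep, __m256i q) {
  return _mm256_mask_cvtepi64_epi16(fallback, keep, q);
}

Because every such wrapper is a one-line call into a __builtin_ia32_* mask builtin, collapsing the three-line extern-inline prologue into __funline changes only the declaration syntax, not the generated code.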
-extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtsepi64_epi16(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtsepi64_epi16(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovsqw128_mask( (__v2di)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtsepi64_epi16(__m256i __A) { +__funline __m128i _mm256_cvtsepi64_epi16(__m256i __A) { return (__m128i)__builtin_ia32_pmovsqw256_mask( (__v4di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) { +__funline void _mm256_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, + __m256i __A) { __builtin_ia32_pmovsqw256mem_mask((__v8hi *)__P, (__v4di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovsqw256_mask((__v4di)__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtsepi64_epi16(__mmask8 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtsepi64_epi16(__mmask8 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovsqw256_mask( (__v4di)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtusepi64_epi16(__m128i __A) { +__funline __m128i _mm_cvtusepi64_epi16(__m128i __A) { return (__m128i)__builtin_ia32_pmovusqw128_mask( (__v2di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovusqw128mem_mask((__v8hi *)__P, (__v2di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pmovusqw128_mask((__v2di)__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtusepi64_epi16(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtusepi64_epi16(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovusqw128_mask( (__v2di)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtusepi64_epi16(__m256i __A) { +__funline __m128i _mm256_cvtusepi64_epi16(__m256i __A) { return (__m128i)__builtin_ia32_pmovusqw256_mask( (__v4di)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m256i __A) { +__funline void _mm256_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, + __m256i __A) { __builtin_ia32_pmovusqw256mem_mask((__v8hi *)__P, (__v4di)__A, __M); } -extern 
__inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovusqw256_mask((__v4di)__A, (__v8hi)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtusepi64_epi16(__mmask8 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtusepi64_epi16(__mmask8 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovusqw256_mask( (__v4di)__A, (__v8hi)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepi64_epi32(__m128i __A) { +__funline __m128i _mm_cvtepi64_epi32(__m128i __A) { return (__m128i)__builtin_ia32_pmovqd128_mask( (__v2di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovqd128mem_mask((__v4si *)__P, (__v2di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pmovqd128_mask((__v2di)__A, (__v4si)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi64_epi32(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtepi64_epi32(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovqd128_mask( (__v2di)__A, (__v4si)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi64_epi32(__m256i __A) { +__funline __m128i _mm256_cvtepi64_epi32(__m256i __A) { return (__m128i)__builtin_ia32_pmovqd256_mask( (__v4di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, __m256i __A) { +__funline void _mm256_mask_cvtepi64_storeu_epi32(void *__P, __mmask8 __M, + __m256i __A) { __builtin_ia32_pmovqd256mem_mask((__v4si *)__P, (__v4di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovqd256_mask((__v4di)__A, (__v4si)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi64_epi32(__mmask8 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtepi64_epi32(__mmask8 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovqd256_mask( (__v4di)__A, (__v4si)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsepi64_epi32(__m128i __A) { +__funline __m128i _mm_cvtsepi64_epi32(__m128i __A) { return (__m128i)__builtin_ia32_pmovsqd128_mask( (__v2di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovsqd128mem_mask((__v4si *)__P, (__v2di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtsepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtsepi64_epi32(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pmovsqd128_mask((__v2di)__A, (__v4si)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtsepi64_epi32(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtsepi64_epi32(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovsqd128_mask( (__v2di)__A, (__v4si)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtsepi64_epi32(__m256i __A) { +__funline __m128i _mm256_cvtsepi64_epi32(__m256i __A) { return (__m128i)__builtin_ia32_pmovsqd256_mask( (__v4di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, __m256i __A) { +__funline void _mm256_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, + __m256i __A) { __builtin_ia32_pmovsqd256mem_mask((__v4si *)__P, (__v4di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtsepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtsepi64_epi32(__m128i __O, __mmask8 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovsqd256_mask((__v4di)__A, (__v4si)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtsepi64_epi32(__mmask8 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtsepi64_epi32(__mmask8 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovsqd256_mask( (__v4di)__A, (__v4si)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtusepi64_epi32(__m128i __A) { +__funline __m128i _mm_cvtusepi64_epi32(__m128i __A) { return (__m128i)__builtin_ia32_pmovusqd128_mask( (__v2di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, __m128i __A) { +__funline void _mm_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, + __m128i __A) { __builtin_ia32_pmovusqd128mem_mask((__v4si *)__P, (__v2di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtusepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_cvtusepi64_epi32(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pmovusqd128_mask((__v2di)__A, (__v4si)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtusepi64_epi32(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_cvtusepi64_epi32(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pmovusqd128_mask( (__v2di)__A, (__v4si)_mm_setzero_si128(), __M); } -extern 
__inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtusepi64_epi32(__m256i __A) { +__funline __m128i _mm256_cvtusepi64_epi32(__m256i __A) { return (__m128i)__builtin_ia32_pmovusqd256_mask( (__v4di)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, __m256i __A) { +__funline void _mm256_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, + __m256i __A) { __builtin_ia32_pmovusqd256mem_mask((__v4si *)__P, (__v4di)__A, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtusepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) { +__funline __m128i _mm256_mask_cvtusepi64_epi32(__m128i __O, __mmask8 __M, + __m256i __A) { return (__m128i)__builtin_ia32_pmovusqd256_mask((__v4di)__A, (__v4si)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtusepi64_epi32(__mmask8 __M, __m256i __A) { +__funline __m128i _mm256_maskz_cvtusepi64_epi32(__mmask8 __M, __m256i __A) { return (__m128i)__builtin_ia32_pmovusqd256_mask( (__v4di)__A, (__v4si)_mm_setzero_si128(), __M); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_broadcastss_ps(__m256 __O, __mmask8 __M, __m128 __A) { +__funline __m256 _mm256_mask_broadcastss_ps(__m256 __O, __mmask8 __M, + __m128 __A) { return (__m256)__builtin_ia32_broadcastss256_mask((__v4sf)__A, (__v8sf)__O, __M); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) { +__funline __m256 _mm256_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_broadcastss256_mask( (__v4sf)__A, (__v8sf)_mm256_setzero_ps(), __M); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_broadcastss_ps(__m128 __O, __mmask8 __M, __m128 __A) { +__funline __m128 _mm_mask_broadcastss_ps(__m128 __O, __mmask8 __M, __m128 __A) { return (__m128)__builtin_ia32_broadcastss128_mask((__v4sf)__A, (__v4sf)__O, __M); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) { +__funline __m128 _mm_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) { return (__m128)__builtin_ia32_broadcastss128_mask( (__v4sf)__A, (__v4sf)_mm_setzero_ps(), __M); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_broadcastsd_pd(__m256d __O, __mmask8 __M, __m128d __A) { +__funline __m256d _mm256_mask_broadcastsd_pd(__m256d __O, __mmask8 __M, + __m128d __A) { return (__m256d)__builtin_ia32_broadcastsd256_mask((__v2df)__A, (__v4df)__O, __M); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) { +__funline __m256d _mm256_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_broadcastsd256_mask( (__v2df)__A, (__v4df)_mm256_setzero_pd(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_broadcastd_epi32(__m256i __O, __mmask8 __M, __m128i __A) { +__funline __m256i _mm256_mask_broadcastd_epi32(__m256i __O, __mmask8 __M, + __m128i __A) { return 
(__m256i)__builtin_ia32_pbroadcastd256_mask((__v4si)__A, (__v8si)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) { +__funline __m256i _mm256_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_pbroadcastd256_mask( (__v4si)__A, (__v8si)_mm256_setzero_si256(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) { +__funline __m256i _mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) { return (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask(__A, (__v8si)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_set1_epi32(__mmask8 __M, int __A) { +__funline __m256i _mm256_maskz_set1_epi32(__mmask8 __M, int __A) { return (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask( __A, (__v8si)_mm256_setzero_si256(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_broadcastd_epi32(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_broadcastd_epi32(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pbroadcastd128_mask((__v4si)__A, (__v4si)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pbroadcastd128_mask( (__v4si)__A, (__v4si)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) { +__funline __m128i _mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) { return (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask(__A, (__v4si)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_set1_epi32(__mmask8 __M, int __A) { +__funline __m128i _mm_maskz_set1_epi32(__mmask8 __M, int __A) { return (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask( __A, (__v4si)_mm_setzero_si128(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_broadcastq_epi64(__m256i __O, __mmask8 __M, __m128i __A) { +__funline __m256i _mm256_mask_broadcastq_epi64(__m256i __O, __mmask8 __M, + __m128i __A) { return (__m256i)__builtin_ia32_pbroadcastq256_mask((__v2di)__A, (__v4di)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { +__funline __m256i _mm256_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_pbroadcastq256_mask( (__v2di)__A, (__v4di)_mm256_setzero_si256(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) { +__funline __m256i _mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, + long long __A) { return (__m256i)__builtin_ia32_pbroadcastq256_gpr_mask(__A, (__v4di)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_set1_epi64(__mmask8 __M, long long __A) { +__funline __m256i _mm256_maskz_set1_epi64(__mmask8 __M, 
long long __A) { return (__m256i)__builtin_ia32_pbroadcastq256_gpr_mask( __A, (__v4di)_mm256_setzero_si256(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_broadcastq_epi64(__m128i __O, __mmask8 __M, __m128i __A) { +__funline __m128i _mm_mask_broadcastq_epi64(__m128i __O, __mmask8 __M, + __m128i __A) { return (__m128i)__builtin_ia32_pbroadcastq128_mask((__v2di)__A, (__v2di)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { +__funline __m128i _mm_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_pbroadcastq128_mask( (__v2di)__A, (__v2di)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) { +__funline __m128i _mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) { return (__m128i)__builtin_ia32_pbroadcastq128_gpr_mask(__A, (__v2di)__O, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_set1_epi64(__mmask8 __M, long long __A) { +__funline __m128i _mm_maskz_set1_epi64(__mmask8 __M, long long __A) { return (__m128i)__builtin_ia32_pbroadcastq128_gpr_mask( __A, (__v2di)_mm_setzero_si128(), __M); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcast_f32x4(__m128 __A) { +__funline __m256 _mm256_broadcast_f32x4(__m128 __A) { return (__m256)__builtin_ia32_broadcastf32x4_256_mask( (__v4sf)__A, (__v8sf)_mm256_undefined_pd(), (__mmask8)-1); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) { +__funline __m256 _mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, + __m128 __A) { return (__m256)__builtin_ia32_broadcastf32x4_256_mask((__v4sf)__A, (__v8sf)__O, __M); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_broadcast_f32x4(__mmask8 __M, __m128 __A) { +__funline __m256 _mm256_maskz_broadcast_f32x4(__mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_broadcastf32x4_256_mask( (__v4sf)__A, (__v8sf)_mm256_setzero_ps(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcast_i32x4(__m128i __A) { +__funline __m256i _mm256_broadcast_i32x4(__m128i __A) { return (__m256i)__builtin_ia32_broadcasti32x4_256_mask( (__v4si)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) { +__funline __m256i _mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, + __m128i __A) { return (__m256i)__builtin_ia32_broadcasti32x4_256_mask((__v4si)__A, (__v8si)__O, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) { +__funline __m256i _mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_broadcasti32x4_256_mask( (__v4si)__A, (__v8si)_mm256_setzero_si256(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi8_epi32(__m256i __W, __mmask8 __U, __m128i 
__A) { +__funline __m256i _mm256_mask_cvtepi8_epi32(__m256i __W, __mmask8 __U, + __m128i __A) { return (__m256i)__builtin_ia32_pmovsxbd256_mask((__v16qi)__A, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) { +__funline __m256i _mm256_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) { return (__m256i)__builtin_ia32_pmovsxbd256_mask( (__v16qi)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pmovsxbd128_mask((__v16qi)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pmovsxbd128_mask( (__v16qi)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A) { +__funline __m256i _mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, + __m128i __A) { return (__m256i)__builtin_ia32_pmovsxbq256_mask((__v16qi)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { +__funline __m256i _mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { return (__m256i)__builtin_ia32_pmovsxbq256_mask( (__v16qi)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pmovsxbq128_mask((__v16qi)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pmovsxbq128_mask( (__v16qi)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A) { +__funline __m256i _mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, + __m128i __A) { return (__m256i)__builtin_ia32_pmovsxwd256_mask((__v8hi)__A, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) { +__funline __m256i _mm256_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) { return (__m256i)__builtin_ia32_pmovsxwd256_mask( (__v8hi)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, + __m128i __A) { return (__m128i)__builtin_ia32_pmovsxwd128_mask((__v8hi)__A, 
(__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pmovsxwd128_mask( (__v8hi)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A) { +__funline __m256i _mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, + __m128i __A) { return (__m256i)__builtin_ia32_pmovsxwq256_mask((__v8hi)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { +__funline __m256i _mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { return (__m256i)__builtin_ia32_pmovsxwq256_mask( (__v8hi)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, + __m128i __A) { return (__m128i)__builtin_ia32_pmovsxwq128_mask((__v8hi)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pmovsxwq128_mask( (__v8hi)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X) { +__funline __m256i _mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, + __m128i __X) { return (__m256i)__builtin_ia32_pmovsxdq256_mask((__v4si)__X, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) { +__funline __m256i _mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) { return (__m256i)__builtin_ia32_pmovsxdq256_mask( (__v4si)__X, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X) { +__funline __m128i _mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, + __m128i __X) { return (__m128i)__builtin_ia32_pmovsxdq128_mask((__v4si)__X, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) { +__funline __m128i _mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) { return (__m128i)__builtin_ia32_pmovsxdq128_mask( (__v4si)__X, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A) { +__funline __m256i _mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, + __m128i __A) { return (__m256i)__builtin_ia32_pmovzxbd256_mask((__v16qi)__A, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) { +__funline __m256i _mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) { return (__m256i)__builtin_ia32_pmovzxbd256_mask( (__v16qi)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pmovzxbd128_mask((__v16qi)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pmovzxbd128_mask( (__v16qi)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A) { +__funline __m256i _mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, + __m128i __A) { return (__m256i)__builtin_ia32_pmovzxbq256_mask((__v16qi)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { +__funline __m256i _mm256_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { return (__m256i)__builtin_ia32_pmovzxbq256_mask( (__v16qi)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pmovzxbq128_mask((__v16qi)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pmovzxbq128_mask( (__v16qi)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A) { +__funline __m256i _mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, + __m128i __A) { return (__m256i)__builtin_ia32_pmovzxwd256_mask((__v8hi)__A, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) { +__funline __m256i _mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) { return (__m256i)__builtin_ia32_pmovzxwd256_mask( (__v8hi)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, + __m128i __A) { return (__m128i)__builtin_ia32_pmovzxwd128_mask((__v8hi)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) { return 
(__m128i)__builtin_ia32_pmovzxwd128_mask( (__v8hi)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A) { +__funline __m256i _mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, + __m128i __A) { return (__m256i)__builtin_ia32_pmovzxwq256_mask((__v8hi)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { +__funline __m256i _mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { return (__m256i)__builtin_ia32_pmovzxwq256_mask( (__v8hi)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, + __m128i __A) { return (__m128i)__builtin_ia32_pmovzxwq128_mask((__v8hi)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_pmovzxwq128_mask( (__v8hi)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X) { +__funline __m256i _mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, + __m128i __X) { return (__m256i)__builtin_ia32_pmovzxdq256_mask((__v4si)__X, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) { +__funline __m256i _mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) { return (__m256i)__builtin_ia32_pmovzxdq256_mask( (__v4si)__X, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X) { +__funline __m128i _mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, + __m128i __X) { return (__m128i)__builtin_ia32_pmovzxdq128_mask((__v4si)__X, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) { +__funline __m128i _mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) { return (__m128i)__builtin_ia32_pmovzxdq128_mask( (__v4si)__X, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_rcp14_pd(__m256d __A) { +__funline __m256d _mm256_rcp14_pd(__m256d __A) { return (__m256d)__builtin_ia32_rcp14pd256_mask( (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_rcp14_pd(__m256d __W, __mmask8 __U, __m256d __A) { +__funline __m256d _mm256_mask_rcp14_pd(__m256d __W, __mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_rcp14pd256_mask((__v4df)__A, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm256_maskz_rcp14_pd(__mmask8 __U, __m256d __A) { +__funline __m256d _mm256_maskz_rcp14_pd(__mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_rcp14pd256_mask( (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rcp14_pd(__m128d __A) { +__funline __m128d _mm_rcp14_pd(__m128d __A) { return (__m128d)__builtin_ia32_rcp14pd128_mask( (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_rcp14_pd(__m128d __W, __mmask8 __U, __m128d __A) { +__funline __m128d _mm_mask_rcp14_pd(__m128d __W, __mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_rcp14pd128_mask((__v2df)__A, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_rcp14_pd(__mmask8 __U, __m128d __A) { +__funline __m128d _mm_maskz_rcp14_pd(__mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_rcp14pd128_mask( (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_rcp14_ps(__m256 __A) { +__funline __m256 _mm256_rcp14_ps(__m256 __A) { return (__m256)__builtin_ia32_rcp14ps256_mask( (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_rcp14_ps(__m256 __W, __mmask8 __U, __m256 __A) { +__funline __m256 _mm256_mask_rcp14_ps(__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_rcp14ps256_mask((__v8sf)__A, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_rcp14_ps(__mmask8 __U, __m256 __A) { +__funline __m256 _mm256_maskz_rcp14_ps(__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_rcp14ps256_mask( (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rcp14_ps(__m128 __A) { +__funline __m128 _mm_rcp14_ps(__m128 __A) { return (__m128)__builtin_ia32_rcp14ps128_mask( (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_rcp14_ps(__m128 __W, __mmask8 __U, __m128 __A) { +__funline __m128 _mm_mask_rcp14_ps(__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_rcp14ps128_mask((__v4sf)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_rcp14_ps(__mmask8 __U, __m128 __A) { +__funline __m128 _mm_maskz_rcp14_ps(__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_rcp14ps128_mask( (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_rsqrt14_pd(__m256d __A) { +__funline __m256d _mm256_rsqrt14_pd(__m256d __A) { return (__m256d)__builtin_ia32_rsqrt14pd256_mask( (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_rsqrt14_pd(__m256d __W, __mmask8 __U, __m256d __A) { +__funline __m256d _mm256_mask_rsqrt14_pd(__m256d __W, __mmask8 __U, __m256d __A) { return 
(__m256d)__builtin_ia32_rsqrt14pd256_mask((__v4df)__A, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_rsqrt14_pd(__mmask8 __U, __m256d __A) { +__funline __m256d _mm256_maskz_rsqrt14_pd(__mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_rsqrt14pd256_mask( (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rsqrt14_pd(__m128d __A) { +__funline __m128d _mm_rsqrt14_pd(__m128d __A) { return (__m128d)__builtin_ia32_rsqrt14pd128_mask( (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_rsqrt14_pd(__m128d __W, __mmask8 __U, __m128d __A) { +__funline __m128d _mm_mask_rsqrt14_pd(__m128d __W, __mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_rsqrt14pd128_mask((__v2df)__A, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_rsqrt14_pd(__mmask8 __U, __m128d __A) { +__funline __m128d _mm_maskz_rsqrt14_pd(__mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_rsqrt14pd128_mask( (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_rsqrt14_ps(__m256 __A) { +__funline __m256 _mm256_rsqrt14_ps(__m256 __A) { return (__m256)__builtin_ia32_rsqrt14ps256_mask( (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_rsqrt14_ps(__m256 __W, __mmask8 __U, __m256 __A) { +__funline __m256 _mm256_mask_rsqrt14_ps(__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_rsqrt14ps256_mask((__v8sf)__A, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_rsqrt14_ps(__mmask8 __U, __m256 __A) { +__funline __m256 _mm256_maskz_rsqrt14_ps(__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_rsqrt14ps256_mask( (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rsqrt14_ps(__m128 __A) { +__funline __m128 _mm_rsqrt14_ps(__m128 __A) { return (__m128)__builtin_ia32_rsqrt14ps128_mask( (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_rsqrt14_ps(__m128 __W, __mmask8 __U, __m128 __A) { +__funline __m128 _mm_mask_rsqrt14_ps(__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_rsqrt14ps128_mask((__v4sf)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_rsqrt14_ps(__mmask8 __U, __m128 __A) { +__funline __m128 _mm_maskz_rsqrt14_ps(__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_rsqrt14ps128_mask( (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) { +__funline __m256d _mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_sqrtpd256_mask((__v4df)__A, (__v4df)__W, (__mmask8)__U); } -extern 
__inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) { +__funline __m256d _mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_sqrtpd256_mask( (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) { +__funline __m128d _mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_sqrtpd128_mask((__v2df)__A, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) { +__funline __m128d _mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_sqrtpd128_mask( (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) { +__funline __m256 _mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_sqrtps256_mask((__v8sf)__A, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) { +__funline __m256 _mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_sqrtps256_mask( (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) { +__funline __m128 _mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_sqrtps128_mask((__v4sf)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) { +__funline __m128 _mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_sqrtps128_mask( (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_paddd256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_paddd256_mask( (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_paddq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_add_epi64(__mmask8 __U, __m256i 
__A, __m256i __B) { return (__m256i)__builtin_ia32_paddq256_mask( (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psubd256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_psubd256_mask( (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_psubq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_psubq256_mask( (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_paddd128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_paddd128_mask( (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_paddq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_paddq128_mask( (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psubd128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psubd128_mask( (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psubq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psubq128_mask( (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_getexp_ps(__m256 __A) { +__funline __m256 _mm256_getexp_ps(__m256 __A) { return (__m256)__builtin_ia32_getexpps256_mask( (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_getexp_ps(__m256 __W, __mmask8 __U, __m256 __A) { +__funline __m256 _mm256_mask_getexp_ps(__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_getexpps256_mask((__v8sf)__A, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_getexp_ps(__mmask8 __U, __m256 __A) { +__funline __m256 _mm256_maskz_getexp_ps(__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_getexpps256_mask( (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_getexp_pd(__m256d __A) { +__funline __m256d _mm256_getexp_pd(__m256d __A) { return (__m256d)__builtin_ia32_getexppd256_mask( (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_getexp_pd(__m256d __W, __mmask8 __U, __m256d __A) { +__funline __m256d _mm256_mask_getexp_pd(__m256d __W, __mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_getexppd256_mask((__v4df)__A, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_getexp_pd(__mmask8 __U, __m256d __A) { +__funline __m256d _mm256_maskz_getexp_pd(__mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_getexppd256_mask( (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_getexp_ps(__m128 __A) { +__funline __m128 _mm_getexp_ps(__m128 __A) { return (__m128)__builtin_ia32_getexpps128_mask( (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_getexp_ps(__m128 __W, __mmask8 __U, __m128 __A) { +__funline __m128 _mm_mask_getexp_ps(__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_getexpps128_mask((__v4sf)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm_maskz_getexp_ps(__mmask8 __U, __m128 __A) { +__funline __m128 _mm_maskz_getexp_ps(__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_getexpps128_mask( (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_getexp_pd(__m128d __A) { +__funline __m128d _mm_getexp_pd(__m128d __A) { return (__m128d)__builtin_ia32_getexppd128_mask( (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)-1); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_getexp_pd(__m128d __W, __mmask8 __U, __m128d __A) { +__funline __m128d _mm_mask_getexp_pd(__m128d __W, __mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_getexppd128_mask((__v2df)__A, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_getexp_pd(__mmask8 __U, __m128d __A) { +__funline __m128d _mm_maskz_getexp_pd(__mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_getexppd128_mask( (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) { return (__m256i)__builtin_ia32_psrld256_mask((__v8si)__A, (__v4si)__B, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psrld256_mask( (__v8si)__A, (__v4si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psrld128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psrld128_mask( (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) { return (__m256i)__builtin_ia32_psrlq256_mask((__v4di)__A, (__v2di)__B, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psrlq256_mask( (__v4di)__A, (__v2di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline 
__m128i _mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psrlq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psrlq128_mask( (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pandd256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pandd256_mask( (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_scalef_pd(__m256d __A, __m256d __B) { +__funline __m256d _mm256_scalef_pd(__m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_scalefpd256_mask( (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_scalef_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { +__funline __m256d _mm256_mask_scalef_pd(__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) { return (__m256d)__builtin_ia32_scalefpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_scalef_pd(__mmask8 __U, __m256d __A, __m256d __B) { +__funline __m256d _mm256_maskz_scalef_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_scalefpd256_mask( (__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_scalef_ps(__m256 __A, __m256 __B) { +__funline __m256 _mm256_scalef_ps(__m256 __A, __m256 __B) { return (__m256)__builtin_ia32_scalefps256_mask( (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_scalef_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_mask_scalef_ps(__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) { return (__m256)__builtin_ia32_scalefps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_scalef_ps(__mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_maskz_scalef_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_scalefps256_mask( (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_scalef_pd(__m128d __A, __m128d __B) { +__funline 
__m128d _mm_scalef_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_scalefpd128_mask( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)-1); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_scalef_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_mask_scalef_pd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { return (__m128d)__builtin_ia32_scalefpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_scalef_pd(__mmask8 __U, __m128d __A, __m128d __B) { +__funline __m128d _mm_maskz_scalef_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_scalefpd128_mask( (__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_scalef_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_scalef_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_scalefps128_mask( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_scalef_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_scalef_ps(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_scalefps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_scalef_ps(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_scalef_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_scalefps128_mask( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { +__funline __m256d _mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) { return (__m256d)__builtin_ia32_vfmaddpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { +__funline __m256d _mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) { return (__m256d)__builtin_ia32_vfmaddpd256_mask3((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { +__funline __m256d _mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) { return (__m256d)__builtin_ia32_vfmaddpd256_maskz((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { +__funline __m128d _mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) { return (__m128d)__builtin_ia32_vfmaddpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmadd_pd(__m128d __A, __m128d __B, 
__m128d __C, __mmask8 __U) { +__funline __m128d _mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) { return (__m128d)__builtin_ia32_vfmaddpd128_mask3((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { +__funline __m128d _mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) { return (__m128d)__builtin_ia32_vfmaddpd128_maskz((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { +__funline __m256 _mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) { return (__m256)__builtin_ia32_vfmaddps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { +__funline __m256 _mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) { return (__m256)__builtin_ia32_vfmaddps256_mask3((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { +__funline __m256 _mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) { return (__m256)__builtin_ia32_vfmaddps256_maskz((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { +__funline __m128 _mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, + __m128 __C) { return (__m128)__builtin_ia32_vfmaddps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { +__funline __m128 _mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, + __mmask8 __U) { return (__m128)__builtin_ia32_vfmaddps128_mask3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { +__funline __m128 _mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, + __m128 __C) { return (__m128)__builtin_ia32_vfmaddps128_maskz((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { +__funline __m256d _mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) { return (__m256d)__builtin_ia32_vfmsubpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { +__funline __m256d _mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) { return (__m256d)__builtin_ia32_vfmsubpd256_mask3((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline 
__m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { +__funline __m256d _mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) { return (__m256d)__builtin_ia32_vfmsubpd256_maskz((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { +__funline __m128d _mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) { return (__m128d)__builtin_ia32_vfmsubpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { +__funline __m128d _mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) { return (__m128d)__builtin_ia32_vfmsubpd128_mask3((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { +__funline __m128d _mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) { return (__m128d)__builtin_ia32_vfmsubpd128_maskz((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { +__funline __m256 _mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) { return (__m256)__builtin_ia32_vfmsubps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { +__funline __m256 _mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) { return (__m256)__builtin_ia32_vfmsubps256_mask3((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { +__funline __m256 _mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) { return (__m256)__builtin_ia32_vfmsubps256_maskz((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { +__funline __m128 _mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, + __m128 __C) { return (__m128)__builtin_ia32_vfmsubps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { +__funline __m128 _mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, + __mmask8 __U) { return (__m128)__builtin_ia32_vfmsubps128_mask3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { +__funline __m128 _mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, + __m128 __C) { 
return (__m128)__builtin_ia32_vfmsubps128_maskz((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { +__funline __m256d _mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) { return (__m256d)__builtin_ia32_vfmaddsubpd256_mask( (__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { +__funline __m256d _mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) { return (__m256d)__builtin_ia32_vfmaddsubpd256_mask3( (__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { +__funline __m256d _mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) { return (__m256d)__builtin_ia32_vfmaddsubpd256_maskz( (__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { +__funline __m128d _mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsubpd128_mask( (__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { +__funline __m128d _mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) { return (__m128d)__builtin_ia32_vfmaddsubpd128_mask3( (__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { +__funline __m128d _mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsubpd128_maskz( (__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { +__funline __m256 _mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) { return (__m256)__builtin_ia32_vfmaddsubps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { +__funline __m256 _mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) { return (__m256)__builtin_ia32_vfmaddsubps256_mask3( (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { +__funline __m256 _mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) { return (__m256)__builtin_ia32_vfmaddsubps256_maskz( (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m128 
- __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { +__funline __m128 _mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, + __m128 __C) { return (__m128)__builtin_ia32_vfmaddsubps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { +__funline __m128 _mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, + __mmask8 __U) { return (__m128)__builtin_ia32_vfmaddsubps128_mask3( (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { +__funline __m128 _mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, + __m128 __C) { return (__m128)__builtin_ia32_vfmaddsubps128_maskz( (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { +__funline __m256d _mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) { return (__m256d)__builtin_ia32_vfmaddsubpd256_mask( (__v4df)__A, (__v4df)__B, -(__v4df)__C, (__mmask8)__U); } -extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { +__funline __m256d _mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) { return (__m256d)__builtin_ia32_vfmsubaddpd256_mask3( (__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { +__funline __m256d _mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) { return (__m256d)__builtin_ia32_vfmaddsubpd256_maskz( (__v4df)__A, (__v4df)__B, -(__v4df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { +__funline __m128d _mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsubpd128_mask( (__v2df)__A, (__v2df)__B, -(__v2df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { +__funline __m128d _mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) { return (__m128d)__builtin_ia32_vfmsubaddpd128_mask3( (__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { +__funline __m128d _mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsubpd128_maskz( (__v2df)__A, (__v2df)__B, -(__v2df)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 
__C) { +__funline __m256 _mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) { return (__m256)__builtin_ia32_vfmaddsubps256_mask( (__v8sf)__A, (__v8sf)__B, -(__v8sf)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { +__funline __m256 _mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) { return (__m256)__builtin_ia32_vfmsubaddps256_mask3( (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { +__funline __m256 _mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) { return (__m256)__builtin_ia32_vfmaddsubps256_maskz( (__v8sf)__A, (__v8sf)__B, -(__v8sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { +__funline __m128 _mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, + __m128 __C) { return (__m128)__builtin_ia32_vfmaddsubps128_mask( (__v4sf)__A, (__v4sf)__B, -(__v4sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { +__funline __m128 _mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, + __mmask8 __U) { return (__m128)__builtin_ia32_vfmsubaddps128_mask3( (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { +__funline __m128 _mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, + __m128 __C) { return (__m128)__builtin_ia32_vfmaddsubps128_maskz( (__v4sf)__A, (__v4sf)__B, -(__v4sf)__C, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { +__funline __m256d _mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) { return (__m256d)__builtin_ia32_vfnmaddpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { +__funline __m256d _mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) { return (__m256d)__builtin_ia32_vfnmaddpd256_mask3((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { +__funline __m256d _mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) { return (__m256d)__builtin_ia32_vfnmaddpd256_maskz((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { +__funline __m128d _mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) { return (__m128d)__builtin_ia32_vfnmaddpd128_mask((__v2df)__A, 
(__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { +__funline __m128d _mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) { return (__m128d)__builtin_ia32_vfnmaddpd128_mask3((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { +__funline __m128d _mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) { return (__m128d)__builtin_ia32_vfnmaddpd128_maskz((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { +__funline __m256 _mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) { return (__m256)__builtin_ia32_vfnmaddps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { +__funline __m256 _mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) { return (__m256)__builtin_ia32_vfnmaddps256_mask3((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { +__funline __m256 _mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) { return (__m256)__builtin_ia32_vfnmaddps256_maskz((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { +__funline __m128 _mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, + __m128 __C) { return (__m128)__builtin_ia32_vfnmaddps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { +__funline __m128 _mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, + __mmask8 __U) { return (__m128)__builtin_ia32_vfnmaddps128_mask3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { +__funline __m128 _mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, + __m128 __C) { return (__m128)__builtin_ia32_vfnmaddps128_maskz((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { +__funline __m256d _mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) { return (__m256d)__builtin_ia32_vfnmsubpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { +__funline 
__m256d _mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) { return (__m256d)__builtin_ia32_vfnmsubpd256_mask3((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { +__funline __m256d _mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) { return (__m256d)__builtin_ia32_vfnmsubpd256_maskz((__v4df)__A, (__v4df)__B, (__v4df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { +__funline __m128d _mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) { return (__m128d)__builtin_ia32_vfnmsubpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { +__funline __m128d _mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) { return (__m128d)__builtin_ia32_vfnmsubpd128_mask3((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { +__funline __m128d _mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) { return (__m128d)__builtin_ia32_vfnmsubpd128_maskz((__v2df)__A, (__v2df)__B, (__v2df)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { +__funline __m256 _mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) { return (__m256)__builtin_ia32_vfnmsubps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { +__funline __m256 _mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) { return (__m256)__builtin_ia32_vfnmsubps256_mask3((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { +__funline __m256 _mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) { return (__m256)__builtin_ia32_vfnmsubps256_maskz((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { +__funline __m128 _mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, + __m128 __C) { return (__m128)__builtin_ia32_vfnmsubps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { +__funline __m128 _mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, + __mmask8 __U) { return (__m128)__builtin_ia32_vfnmsubps128_mask3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m128 
- __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { +__funline __m128 _mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, + __m128 __C) { return (__m128)__builtin_ia32_vfnmsubps128_maskz((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pandd128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_and_epi32(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_and_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pandd128_mask( (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pandnd256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_andnot_epi32(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_andnot_epi32(__mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pandnd256_mask( (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pandnd128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_andnot_epi32(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_andnot_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pandnd128_mask( (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_or_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_or_epi32(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pord256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_or_epi32(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_or_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pord256_mask( (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_or_epi32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_or_epi32(__m256i __A, __m256i __B) { return (__m256i)((__v8su)__A | (__v8su)__B); } -extern 
__inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pord128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_or_epi32(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_or_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pord128_mask( (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_or_epi32(__m128i __A, __m128i __B) { +__funline __m128i _mm_or_epi32(__m128i __A, __m128i __B) { return (__m128i)((__v4su)__A | (__v4su)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pxord256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_xor_epi32(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_xor_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_pxord256_mask( (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_xor_epi32(__m256i __A, __m256i __B) { +__funline __m256i _mm256_xor_epi32(__m256i __A, __m256i __B) { return (__m256i)((__v8su)__A ^ (__v8su)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pxord128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_xor_epi32(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_xor_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pxord128_mask( (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_xor_epi32(__m128i __A, __m128i __B) { +__funline __m128i _mm_xor_epi32(__m128i __A, __m128i __B) { return (__m128i)((__v4su)__A ^ (__v4su)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtpd_ps(__m128 __W, __mmask8 __U, __m128d __A) { +__funline __m128 _mm_mask_cvtpd_ps(__m128 __W, __mmask8 __U, __m128d __A) { return (__m128)__builtin_ia32_cvtpd2ps_mask((__v2df)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtpd_ps(__mmask8 __U, __m128d __A) { +__funline __m128 _mm_maskz_cvtpd_ps(__mmask8 __U, __m128d __A) { return (__m128)__builtin_ia32_cvtpd2ps_mask( (__v2df)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } 
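The mask_/maskz_ pairs being rewritten above all share one contract: bit i of the __mmask8 operand decides whether destination lane i receives the computed result, with a clear bit selecting the pass-through operand __W (merge-masking) or zero (zero-masking). A minimal scalar sketch of that contract for the 128-bit OR forms, using hypothetical model_* helpers that are not part of this header:

#include <stdint.h>

/* merge-masking: lanes whose mask bit is clear keep the pass-through w */
static void model_mm_mask_or_epi32(uint32_t dst[4], const uint32_t w[4],
                                   uint8_t u, const uint32_t a[4],
                                   const uint32_t b[4]) {
  for (int i = 0; i < 4; ++i) {
    dst[i] = (u >> i) & 1 ? (a[i] | b[i]) : w[i];
  }
}

/* zero-masking: lanes whose mask bit is clear are zeroed instead */
static void model_mm_maskz_or_epi32(uint32_t dst[4], uint8_t u,
                                    const uint32_t a[4], const uint32_t b[4]) {
  for (int i = 0; i < 4; ++i) {
    dst[i] = (u >> i) & 1 ? (a[i] | b[i]) : 0;
  }
}

The unmasked forms such as _mm_or_epi32 and _mm_xor_epi32 above never reach a builtin at all; they are written with GNU vector-extension operators, which is why they collapse to a single expression under this rewrite.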
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_mask_cvtpd_ps(__m128 __W, __mmask8 __U, __m256d __A) {
+__funline __m128 _mm256_mask_cvtpd_ps(__m128 __W, __mmask8 __U, __m256d __A) {
   return (__m128)__builtin_ia32_cvtpd2ps256_mask((__v4df)__A, (__v4sf)__W,
                                                  (__mmask8)__U);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_maskz_cvtpd_ps(__mmask8 __U, __m256d __A) {
+__funline __m128 _mm256_maskz_cvtpd_ps(__mmask8 __U, __m256d __A) {
   return (__m128)__builtin_ia32_cvtpd2ps256_mask(
       (__v4df)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_mask_cvtps_epi32(__m256i __W, __mmask8 __U, __m256 __A) {
+__funline __m256i _mm256_mask_cvtps_epi32(__m256i __W, __mmask8 __U, __m256 __A) {
   return (__m256i)__builtin_ia32_cvtps2dq256_mask((__v8sf)__A, (__v8si)__W,
                                                   (__mmask8)__U);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_maskz_cvtps_epi32(__mmask8 __U, __m256 __A) {
+__funline __m256i _mm256_maskz_cvtps_epi32(__mmask8 __U, __m256 __A) {
   return (__m256i)__builtin_ia32_cvtps2dq256_mask(
       (__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mask_cvtps_epi32(__m128i __W, __mmask8 __U, __m128 __A) {
+__funline __m128i _mm_mask_cvtps_epi32(__m128i __W, __mmask8 __U, __m128 __A) {
   return (__m128i)__builtin_ia32_cvtps2dq128_mask((__v4sf)__A, (__v4si)__W,
                                                   (__mmask8)__U);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_maskz_cvtps_epi32(__mmask8 __U, __m128 __A) {
+__funline __m128i _mm_maskz_cvtps_epi32(__mmask8 __U, __m128 __A) {
   return (__m128i)__builtin_ia32_cvtps2dq128_mask(
       (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_cvtps_epu32(__m256 __A) {
+__funline __m256i _mm256_cvtps_epu32(__m256 __A) {
   return (__m256i)__builtin_ia32_cvtps2udq256_mask(
       (__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)-1);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_mask_cvtps_epu32(__m256i __W, __mmask8 __U, __m256 __A) {
+__funline __m256i _mm256_mask_cvtps_epu32(__m256i __W, __mmask8 __U, __m256 __A) {
   return (__m256i)__builtin_ia32_cvtps2udq256_mask((__v8sf)__A, (__v8si)__W,
                                                    (__mmask8)__U);
 }
-extern __inline __m256i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_maskz_cvtps_epu32(__mmask8 __U, __m256 __A) {
+__funline __m256i _mm256_maskz_cvtps_epu32(__mmask8 __U, __m256 __A) {
   return (__m256i)__builtin_ia32_cvtps2udq256_mask(
       (__v8sf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtps_epu32(__m128 __A) {
+__funline __m128i _mm_cvtps_epu32(__m128 __A) {
   return (__m128i)__builtin_ia32_cvtps2udq128_mask(
       (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mask_cvtps_epu32(__m128i __W, __mmask8 __U, __m128 __A) {
+__funline __m128i _mm_mask_cvtps_epu32(__m128i __W, __mmask8 __U, __m128 __A) {
   return (__m128i)__builtin_ia32_cvtps2udq128_mask((__v4sf)__A, (__v4si)__W,
                                                    (__mmask8)__U);
 }
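Unlike the signed cvtps_epi32 conversions, the unsigned cvtps_epu32 forms above have no pre-AVX512 encoding, so even the unmasked intrinsics funnel through the masked builtin with an all-ones (__mmask8)-1 mask: every lane is written and the _mm_setzero_si128() pass-through is never consulted. A hedged usage sketch of the merge-masked form (the function name and the 0x5 mask value are purely illustrative, and building it assumes an AVX512F+AVX512VL target with the usual immintrin.h entry point):

#include <immintrin.h>

__m128i convert_even_lanes(__m128i fallback, __m128 x) {
  /* lanes 0 and 2 receive float->uint32 conversions; lanes 1 and 3 keep
     the corresponding lanes of fallback */
  return _mm_mask_cvtps_epu32(fallback, 0x5, x);
}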
-extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtps_epu32(__mmask8 __U, __m128 __A) { +__funline __m128i _mm_maskz_cvtps_epu32(__mmask8 __U, __m128 __A) { return (__m128i)__builtin_ia32_cvtps2udq128_mask( (__v4sf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_movedup_pd(__m256d __W, __mmask8 __U, __m256d __A) { +__funline __m256d _mm256_mask_movedup_pd(__m256d __W, __mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_movddup256_mask((__v4df)__A, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_movedup_pd(__mmask8 __U, __m256d __A) { +__funline __m256d _mm256_maskz_movedup_pd(__mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_movddup256_mask( (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_movedup_pd(__m128d __W, __mmask8 __U, __m128d __A) { +__funline __m128d _mm_mask_movedup_pd(__m128d __W, __mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_movddup128_mask((__v2df)__A, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_movedup_pd(__mmask8 __U, __m128d __A) { +__funline __m128d _mm_maskz_movedup_pd(__mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_movddup128_mask( (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_movehdup_ps(__m256 __W, __mmask8 __U, __m256 __A) { +__funline __m256 _mm256_mask_movehdup_ps(__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_movshdup256_mask((__v8sf)__A, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_movehdup_ps(__mmask8 __U, __m256 __A) { +__funline __m256 _mm256_maskz_movehdup_ps(__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_movshdup256_mask( (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_movehdup_ps(__m128 __W, __mmask8 __U, __m128 __A) { +__funline __m128 _mm_mask_movehdup_ps(__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_movshdup128_mask((__v4sf)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_movehdup_ps(__mmask8 __U, __m128 __A) { +__funline __m128 _mm_maskz_movehdup_ps(__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_movshdup128_mask( (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_moveldup_ps(__m256 __W, __mmask8 __U, __m256 __A) { +__funline __m256 _mm256_mask_moveldup_ps(__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_movsldup256_mask((__v8sf)__A, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_moveldup_ps(__mmask8 __U, __m256 __A) { +__funline __m256 _mm256_maskz_moveldup_ps(__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_movsldup256_mask( (__v8sf)__A, 
(__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_moveldup_ps(__m128 __W, __mmask8 __U, __m128 __A) { +__funline __m128 _mm_mask_moveldup_ps(__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_movsldup128_mask((__v4sf)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_moveldup_ps(__mmask8 __U, __m128 __A) { +__funline __m128 _mm_maskz_moveldup_ps(__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_movsldup128_mask( (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpckhdq128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpckhdq128_mask( (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) { +__funline __m256i _mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, + __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpckhdq256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_punpckhdq256_mask( (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpckhqdq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpckhqdq128_mask( (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) { +__funline __m256i _mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, + __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpckhqdq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i 
_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_punpckhqdq256_mask( (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpckldq128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpckldq128_mask( (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) { +__funline __m256i _mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, + __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpckldq256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_punpckldq256_mask( (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpcklqdq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_punpcklqdq128_mask( (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) { +__funline __m256i _mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, + __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_punpcklqdq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_punpcklqdq256_mask( (__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpeq_epu32_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_cmpeq_epu32_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__A, (__v4si)__B, 0, 
(__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpeq_epi32_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_cmpeq_epi32_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__A, (__v4si)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpeq_epu32_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_cmpeq_epu32_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__A, (__v4si)__B, 0, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpeq_epi32_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_cmpeq_epi32_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__A, (__v4si)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpeq_epu32_mask(__m256i __A, __m256i __B) { +__funline __mmask8 _mm256_cmpeq_epu32_mask(__m256i __A, __m256i __B) { return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__A, (__v8si)__B, 0, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpeq_epi32_mask(__m256i __A, __m256i __B) { +__funline __mmask8 _mm256_cmpeq_epi32_mask(__m256i __A, __m256i __B) { return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__A, (__v8si)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpeq_epu32_mask(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __mmask8 _mm256_mask_cmpeq_epu32_mask(__mmask8 __U, __m256i __A, + __m256i __B) { return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__A, (__v8si)__B, 0, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpeq_epi32_mask(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __mmask8 _mm256_mask_cmpeq_epi32_mask(__mmask8 __U, __m256i __A, + __m256i __B) { return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__A, (__v8si)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpeq_epu64_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_cmpeq_epu64_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__A, (__v2di)__B, 0, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpeq_epi64_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_cmpeq_epi64_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__A, (__v2di)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpeq_epu64_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_cmpeq_epu64_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__A, (__v2di)__B, 0, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpeq_epi64_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_cmpeq_epi64_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return 
(__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__A, (__v2di)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpeq_epu64_mask(__m256i __A, __m256i __B) { +__funline __mmask8 _mm256_cmpeq_epu64_mask(__m256i __A, __m256i __B) { return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__A, (__v4di)__B, 0, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpeq_epi64_mask(__m256i __A, __m256i __B) { +__funline __mmask8 _mm256_cmpeq_epi64_mask(__m256i __A, __m256i __B) { return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__A, (__v4di)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpeq_epu64_mask(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __mmask8 _mm256_mask_cmpeq_epu64_mask(__mmask8 __U, __m256i __A, + __m256i __B) { return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__A, (__v4di)__B, 0, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpeq_epi64_mask(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __mmask8 _mm256_mask_cmpeq_epi64_mask(__mmask8 __U, __m256i __A, + __m256i __B) { return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__A, (__v4di)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpgt_epu32_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_cmpgt_epu32_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__A, (__v4si)__B, 6, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpgt_epi32_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_cmpgt_epi32_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__A, (__v4si)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpgt_epu32_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_cmpgt_epu32_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__A, (__v4si)__B, 6, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpgt_epi32_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_cmpgt_epi32_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__A, (__v4si)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpgt_epu32_mask(__m256i __A, __m256i __B) { +__funline __mmask8 _mm256_cmpgt_epu32_mask(__m256i __A, __m256i __B) { return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__A, (__v8si)__B, 6, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpgt_epi32_mask(__m256i __A, __m256i __B) { +__funline __mmask8 _mm256_cmpgt_epi32_mask(__m256i __A, __m256i __B) { return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__A, (__v8si)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpgt_epu32_mask(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __mmask8 _mm256_mask_cmpgt_epu32_mask(__mmask8 __U, __m256i __A, + __m256i 
__B) { return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__A, (__v8si)__B, 6, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpgt_epi32_mask(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __mmask8 _mm256_mask_cmpgt_epi32_mask(__mmask8 __U, __m256i __A, + __m256i __B) { return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__A, (__v8si)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpgt_epu64_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_cmpgt_epu64_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__A, (__v2di)__B, 6, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpgt_epi64_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_cmpgt_epi64_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__A, (__v2di)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpgt_epu64_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_cmpgt_epu64_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__A, (__v2di)__B, 6, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpgt_epi64_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_cmpgt_epi64_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__A, (__v2di)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpgt_epu64_mask(__m256i __A, __m256i __B) { +__funline __mmask8 _mm256_cmpgt_epu64_mask(__m256i __A, __m256i __B) { return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__A, (__v4di)__B, 6, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpgt_epi64_mask(__m256i __A, __m256i __B) { +__funline __mmask8 _mm256_cmpgt_epi64_mask(__m256i __A, __m256i __B) { return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__A, (__v4di)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpgt_epu64_mask(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __mmask8 _mm256_mask_cmpgt_epu64_mask(__mmask8 __U, __m256i __A, + __m256i __B) { return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__A, (__v4di)__B, 6, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpgt_epi64_mask(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __mmask8 _mm256_mask_cmpgt_epi64_mask(__mmask8 __U, __m256i __A, + __m256i __B) { return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__A, (__v4di)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_test_epi32_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_test_epi32_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_ptestmd128((__v4si)__A, (__v4si)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_test_epi32_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_test_epi32_mask(__mmask8 __U, 
__m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_ptestmd128((__v4si)__A, (__v4si)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_test_epi32_mask(__m256i __A, __m256i __B) { +__funline __mmask8 _mm256_test_epi32_mask(__m256i __A, __m256i __B) { return (__mmask8)__builtin_ia32_ptestmd256((__v8si)__A, (__v8si)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_test_epi32_mask(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __mmask8 _mm256_mask_test_epi32_mask(__mmask8 __U, __m256i __A, + __m256i __B) { return (__mmask8)__builtin_ia32_ptestmd256((__v8si)__A, (__v8si)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_test_epi64_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_test_epi64_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_ptestmq128((__v2di)__A, (__v2di)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_test_epi64_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_test_epi64_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_ptestmq128((__v2di)__A, (__v2di)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_test_epi64_mask(__m256i __A, __m256i __B) { +__funline __mmask8 _mm256_test_epi64_mask(__m256i __A, __m256i __B) { return (__mmask8)__builtin_ia32_ptestmq256((__v4di)__A, (__v4di)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_test_epi64_mask(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __mmask8 _mm256_mask_test_epi64_mask(__mmask8 __U, __m256i __A, + __m256i __B) { return (__mmask8)__builtin_ia32_ptestmq256((__v4di)__A, (__v4di)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_testn_epi32_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_testn_epi32_mask(__m128i __A, __m128i __B) { return (__mmask8)__builtin_ia32_ptestnmd128((__v4si)__A, (__v4si)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_testn_epi32_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_testn_epi32_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_ptestnmd128((__v4si)__A, (__v4si)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_testn_epi32_mask(__m256i __A, __m256i __B) { +__funline __mmask8 _mm256_testn_epi32_mask(__m256i __A, __m256i __B) { return (__mmask8)__builtin_ia32_ptestnmd256((__v8si)__A, (__v8si)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_testn_epi32_mask(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __mmask8 _mm256_mask_testn_epi32_mask(__mmask8 __U, __m256i __A, + __m256i __B) { return (__mmask8)__builtin_ia32_ptestnmd256((__v8si)__A, (__v8si)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_testn_epi64_mask(__m128i __A, __m128i __B) { +__funline __mmask8 _mm_testn_epi64_mask(__m128i __A, __m128i __B) { return 
(__mmask8)__builtin_ia32_ptestnmq128((__v2di)__A, (__v2di)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_testn_epi64_mask(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __mmask8 _mm_mask_testn_epi64_mask(__mmask8 __U, __m128i __A, + __m128i __B) { return (__mmask8)__builtin_ia32_ptestnmq128((__v2di)__A, (__v2di)__B, __U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_testn_epi64_mask(__m256i __A, __m256i __B) { +__funline __mmask8 _mm256_testn_epi64_mask(__m256i __A, __m256i __B) { return (__mmask8)__builtin_ia32_ptestnmq256((__v4di)__A, (__v4di)__B, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_testn_epi64_mask(__mmask8 __U, __m256i __A, __m256i __B) { +__funline __mmask8 _mm256_mask_testn_epi64_mask(__mmask8 __U, __m256i __A, + __m256i __B) { return (__mmask8)__builtin_ia32_ptestnmq256((__v4di)__A, (__v4di)__B, __U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_compress_pd(__m256d __W, __mmask8 __U, __m256d __A) { +__funline __m256d _mm256_mask_compress_pd(__m256d __W, __mmask8 __U, + __m256d __A) { return (__m256d)__builtin_ia32_compressdf256_mask((__v4df)__A, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_compress_pd(__mmask8 __U, __m256d __A) { +__funline __m256d _mm256_maskz_compress_pd(__mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_compressdf256_mask( (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m256d __A) { +__funline void _mm256_mask_compressstoreu_pd(void *__P, __mmask8 __U, + __m256d __A) { __builtin_ia32_compressstoredf256_mask((__v4df *)__P, (__v4df)__A, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_compress_pd(__m128d __W, __mmask8 __U, __m128d __A) { +__funline __m128d _mm_mask_compress_pd(__m128d __W, __mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_compressdf128_mask((__v2df)__A, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_compress_pd(__mmask8 __U, __m128d __A) { +__funline __m128d _mm_maskz_compress_pd(__mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_compressdf128_mask( (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m128d __A) { +__funline void _mm_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m128d __A) { __builtin_ia32_compressstoredf128_mask((__v2df *)__P, (__v2df)__A, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_compress_ps(__m256 __W, __mmask8 __U, __m256 __A) { +__funline __m256 _mm256_mask_compress_ps(__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_compresssf256_mask((__v8sf)__A, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_compress_ps(__mmask8 __U, __m256 __A) { +__funline 
__m256 _mm256_maskz_compress_ps(__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_compresssf256_mask( (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_compressstoreu_ps(void *__P, __mmask8 __U, __m256 __A) { +__funline void _mm256_mask_compressstoreu_ps(void *__P, __mmask8 __U, + __m256 __A) { __builtin_ia32_compressstoresf256_mask((__v8sf *)__P, (__v8sf)__A, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_compress_ps(__m128 __W, __mmask8 __U, __m128 __A) { +__funline __m128 _mm_mask_compress_ps(__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_compresssf128_mask((__v4sf)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_compress_ps(__mmask8 __U, __m128 __A) { +__funline __m128 _mm_maskz_compress_ps(__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_compresssf128_mask( (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_compressstoreu_ps(void *__P, __mmask8 __U, __m128 __A) { +__funline void _mm_mask_compressstoreu_ps(void *__P, __mmask8 __U, __m128 __A) { __builtin_ia32_compressstoresf128_mask((__v4sf *)__P, (__v4sf)__A, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_compress_epi64(__m256i __W, __mmask8 __U, __m256i __A) { +__funline __m256i _mm256_mask_compress_epi64(__m256i __W, __mmask8 __U, + __m256i __A) { return (__m256i)__builtin_ia32_compressdi256_mask((__v4di)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_compress_epi64(__mmask8 __U, __m256i __A) { +__funline __m256i _mm256_maskz_compress_epi64(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_compressdi256_mask( (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_compressstoreu_epi64(void *__P, __mmask8 __U, __m256i __A) { +__funline void _mm256_mask_compressstoreu_epi64(void *__P, __mmask8 __U, + __m256i __A) { __builtin_ia32_compressstoredi256_mask((__v4di *)__P, (__v4di)__A, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_compress_epi64(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_compress_epi64(__m128i __W, __mmask8 __U, + __m128i __A) { return (__m128i)__builtin_ia32_compressdi128_mask((__v2di)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_compress_epi64(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_compress_epi64(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_compressdi128_mask( (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_compressstoreu_epi64(void *__P, __mmask8 __U, __m128i __A) { +__funline void _mm_mask_compressstoreu_epi64(void *__P, __mmask8 __U, + __m128i __A) { __builtin_ia32_compressstoredi128_mask((__v2di *)__P, (__v2di)__A, (__mmask8)__U); } -extern __inline __m256i - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_compress_epi32(__m256i __W, __mmask8 __U, __m256i __A) { +__funline __m256i _mm256_mask_compress_epi32(__m256i __W, __mmask8 __U, + __m256i __A) { return (__m256i)__builtin_ia32_compresssi256_mask((__v8si)__A, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_compress_epi32(__mmask8 __U, __m256i __A) { +__funline __m256i _mm256_maskz_compress_epi32(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_compresssi256_mask( (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_compressstoreu_epi32(void *__P, __mmask8 __U, __m256i __A) { +__funline void _mm256_mask_compressstoreu_epi32(void *__P, __mmask8 __U, + __m256i __A) { __builtin_ia32_compressstoresi256_mask((__v8si *)__P, (__v8si)__A, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_compress_epi32(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_compress_epi32(__m128i __W, __mmask8 __U, + __m128i __A) { return (__m128i)__builtin_ia32_compresssi128_mask((__v4si)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_compress_epi32(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_compress_epi32(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_compresssi128_mask( (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_compressstoreu_epi32(void *__P, __mmask8 __U, __m128i __A) { +__funline void _mm_mask_compressstoreu_epi32(void *__P, __mmask8 __U, + __m128i __A) { __builtin_ia32_compressstoresi128_mask((__v4si *)__P, (__v4si)__A, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_expand_pd(__m256d __W, __mmask8 __U, __m256d __A) { +__funline __m256d _mm256_mask_expand_pd(__m256d __W, __mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_expanddf256_mask((__v4df)__A, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_expand_pd(__mmask8 __U, __m256d __A) { +__funline __m256d _mm256_maskz_expand_pd(__mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_expanddf256_maskz( (__v4df)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_expandloadu_pd(__m256d __W, __mmask8 __U, void const *__P) { +__funline __m256d _mm256_mask_expandloadu_pd(__m256d __W, __mmask8 __U, + void const *__P) { return (__m256d)__builtin_ia32_expandloaddf256_mask( (__v4df *)__P, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_expandloadu_pd(__mmask8 __U, void const *__P) { +__funline __m256d _mm256_maskz_expandloadu_pd(__mmask8 __U, void const *__P) { return (__m256d)__builtin_ia32_expandloaddf256_maskz( (__v4df *)__P, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_expand_pd(__m128d __W, __mmask8 __U, __m128d __A) 
{ +__funline __m128d _mm_mask_expand_pd(__m128d __W, __mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_expanddf128_mask((__v2df)__A, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_expand_pd(__mmask8 __U, __m128d __A) { +__funline __m128d _mm_maskz_expand_pd(__mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_expanddf128_maskz( (__v2df)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_expandloadu_pd(__m128d __W, __mmask8 __U, void const *__P) { +__funline __m128d _mm_mask_expandloadu_pd(__m128d __W, __mmask8 __U, + void const *__P) { return (__m128d)__builtin_ia32_expandloaddf128_mask( (__v2df *)__P, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_expandloadu_pd(__mmask8 __U, void const *__P) { +__funline __m128d _mm_maskz_expandloadu_pd(__mmask8 __U, void const *__P) { return (__m128d)__builtin_ia32_expandloaddf128_maskz( (__v2df *)__P, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_expand_ps(__m256 __W, __mmask8 __U, __m256 __A) { +__funline __m256 _mm256_mask_expand_ps(__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_expandsf256_mask((__v8sf)__A, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_expand_ps(__mmask8 __U, __m256 __A) { +__funline __m256 _mm256_maskz_expand_ps(__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_expandsf256_maskz( (__v8sf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_expandloadu_ps(__m256 __W, __mmask8 __U, void const *__P) { +__funline __m256 _mm256_mask_expandloadu_ps(__m256 __W, __mmask8 __U, + void const *__P) { return (__m256)__builtin_ia32_expandloadsf256_mask((__v8sf *)__P, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_expandloadu_ps(__mmask8 __U, void const *__P) { +__funline __m256 _mm256_maskz_expandloadu_ps(__mmask8 __U, void const *__P) { return (__m256)__builtin_ia32_expandloadsf256_maskz( (__v8sf *)__P, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_expand_ps(__m128 __W, __mmask8 __U, __m128 __A) { +__funline __m128 _mm_mask_expand_ps(__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_expandsf128_mask((__v4sf)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_expand_ps(__mmask8 __U, __m128 __A) { +__funline __m128 _mm_maskz_expand_ps(__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_expandsf128_maskz( (__v4sf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_expandloadu_ps(__m128 __W, __mmask8 __U, void const *__P) { +__funline __m128 _mm_mask_expandloadu_ps(__m128 __W, __mmask8 __U, + void const *__P) { return (__m128)__builtin_ia32_expandloadsf128_mask((__v4sf *)__P, (__v4sf)__W, (__mmask8)__U); } -extern 
__inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_expandloadu_ps(__mmask8 __U, void const *__P) { +__funline __m128 _mm_maskz_expandloadu_ps(__mmask8 __U, void const *__P) { return (__m128)__builtin_ia32_expandloadsf128_maskz( (__v4sf *)__P, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_expand_epi64(__m256i __W, __mmask8 __U, __m256i __A) { +__funline __m256i _mm256_mask_expand_epi64(__m256i __W, __mmask8 __U, + __m256i __A) { return (__m256i)__builtin_ia32_expanddi256_mask((__v4di)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_expand_epi64(__mmask8 __U, __m256i __A) { +__funline __m256i _mm256_maskz_expand_epi64(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_expanddi256_maskz( (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_expandloadu_epi64(__m256i __W, __mmask8 __U, void const *__P) { +__funline __m256i _mm256_mask_expandloadu_epi64(__m256i __W, __mmask8 __U, + void const *__P) { return (__m256i)__builtin_ia32_expandloaddi256_mask( (__v4di *)__P, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) { +__funline __m256i _mm256_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) { return (__m256i)__builtin_ia32_expandloaddi256_maskz( (__v4di *)__P, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_expand_epi64(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_expand_epi64(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_expanddi128_mask((__v2di)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_expand_epi64(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_expand_epi64(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_expanddi128_maskz( (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_expandloadu_epi64(__m128i __W, __mmask8 __U, void const *__P) { +__funline __m128i _mm_mask_expandloadu_epi64(__m128i __W, __mmask8 __U, + void const *__P) { return (__m128i)__builtin_ia32_expandloaddi128_mask( (__v2di *)__P, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) { +__funline __m128i _mm_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) { return (__m128i)__builtin_ia32_expandloaddi128_maskz( (__v2di *)__P, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_expand_epi32(__m256i __W, __mmask8 __U, __m256i __A) { +__funline __m256i _mm256_mask_expand_epi32(__m256i __W, __mmask8 __U, + __m256i __A) { return (__m256i)__builtin_ia32_expandsi256_mask((__v8si)__A, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm256_maskz_expand_epi32(__mmask8 __U, __m256i __A) { +__funline __m256i _mm256_maskz_expand_epi32(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_expandsi256_maskz( (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_expandloadu_epi32(__m256i __W, __mmask8 __U, void const *__P) { +__funline __m256i _mm256_mask_expandloadu_epi32(__m256i __W, __mmask8 __U, + void const *__P) { return (__m256i)__builtin_ia32_expandloadsi256_mask( (__v8si *)__P, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_expandloadu_epi32(__mmask8 __U, void const *__P) { +__funline __m256i _mm256_maskz_expandloadu_epi32(__mmask8 __U, void const *__P) { return (__m256i)__builtin_ia32_expandloadsi256_maskz( (__v8si *)__P, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_expand_epi32(__m128i __W, __mmask8 __U, __m128i __A) { +__funline __m128i _mm_mask_expand_epi32(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_expandsi128_mask((__v4si)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_expand_epi32(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_expand_epi32(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_expandsi128_maskz( (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_expandloadu_epi32(__m128i __W, __mmask8 __U, void const *__P) { +__funline __m128i _mm_mask_expandloadu_epi32(__m128i __W, __mmask8 __U, + void const *__P) { return (__m128i)__builtin_ia32_expandloadsi128_mask( (__v4si *)__P, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_expandloadu_epi32(__mmask8 __U, void const *__P) { +__funline __m128i _mm_maskz_expandloadu_epi32(__mmask8 __U, void const *__P) { return (__m128i)__builtin_ia32_expandloadsi128_maskz( (__v4si *)__P, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { +__funline __m256d _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { return (__m256d)__builtin_ia32_vpermt2varpd256_mask((__v4di)__I /* idx */, (__v4df)__A, (__v4df)__B, (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, - __m256d __B) { +__funline __m256d _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, + __m256i __I, __m256d __B) { return (__m256d)__builtin_ia32_vpermt2varpd256_mask((__v4di)__I /* idx */, (__v4df)__A, (__v4df)__B, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U, - __m256d __B) { +__funline __m256d _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, + __mmask8 __U, __m256d __B) { return (__m256d)__builtin_ia32_vpermi2varpd256_mask((__v4df)__A, (__v4di)__I /* idx */, @@ -4537,38 +3490,31 @@ extern __inline 
__m256d (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, - __m256d __B) { +__funline __m256d _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, + __m256i __I, __m256d __B) { return (__m256d)__builtin_ia32_vpermt2varpd256_maskz((__v4di)__I /* idx */, (__v4df)__A, (__v4df)__B, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { +__funline __m256 _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { return (__m256)__builtin_ia32_vpermt2varps256_mask((__v8si)__I /* idx */, (__v8sf)__A, (__v8sf)__B, (__mmask8)-1); } -extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) { +__funline __m256 _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, + __m256i __I, __m256 __B) { return (__m256)__builtin_ia32_vpermt2varps256_mask((__v8si)__I /* idx */, (__v8sf)__A, (__v8sf)__B, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U, - __m256 __B) { +__funline __m256 _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, + __mmask8 __U, __m256 __B) { return (__m256)__builtin_ia32_vpermi2varps256_mask((__v8sf)__A, (__v8si)__I /* idx */, @@ -4576,39 +3522,31 @@ extern __inline __m256 (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, - __m256 __B) { +__funline __m256 _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, + __m256i __I, __m256 __B) { return (__m256)__builtin_ia32_vpermt2varps256_maskz((__v8si)__I /* idx */, (__v8sf)__A, (__v8sf)__B, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) { +__funline __m128i _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermt2varq128_mask((__v2di)__I /* idx */, (__v2di)__A, (__v2di)__B, (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, - __m128i __B) { +__funline __m128i _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, + __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermt2varq128_mask((__v2di)__I /* idx */, (__v2di)__A, (__v2di)__B, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U, - __m128i __B) { +__funline __m128i _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, + __mmask8 __U, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varq128_mask((__v2di)__A, (__v2di)__I /* idx */, @@ -4616,39 +3554,31 @@ extern __inline __m128i (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, - __m128i __B) { +__funline __m128i _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, + __m128i __I, __m128i __B) { return 
(__m128i)__builtin_ia32_vpermt2varq128_maskz((__v2di)__I /* idx */, (__v2di)__A, (__v2di)__B, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) { +__funline __m128i _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermt2vard128_mask((__v4si)__I /* idx */, (__v4si)__A, (__v4si)__B, (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, - __m128i __B) { +__funline __m128i _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, + __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermt2vard128_mask((__v4si)__I /* idx */, (__v4si)__A, (__v4si)__B, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, - __m128i __B) { +__funline __m128i _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, + __mmask8 __U, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2vard128_mask((__v4si)__A, (__v4si)__I /* idx */, @@ -4656,39 +3586,32 @@ extern __inline __m128i (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I, - __m128i __B) { +__funline __m128i _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, + __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermt2vard128_maskz((__v4si)__I /* idx */, (__v4si)__A, (__v4si)__B, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) { +__funline __m256i _mm256_permutex2var_epi64(__m256i __A, __m256i __I, + __m256i __B) { return (__m256i)__builtin_ia32_vpermt2varq256_mask((__v4di)__I /* idx */, (__v4di)__A, (__v4di)__B, (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, - __m256i __B) { +__funline __m256i _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, + __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermt2varq256_mask((__v4di)__I /* idx */, (__v4di)__A, (__v4di)__B, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, - __m256i __B) { +__funline __m256i _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, + __mmask8 __U, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varq256_mask((__v4di)__A, (__v4di)__I /* idx */, @@ -4696,39 +3619,32 @@ extern __inline __m256i (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, - __m256i __B) { +__funline __m256i _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, + __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermt2varq256_maskz((__v4di)__I /* idx */, (__v4di)__A, (__v4di)__B, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) { +__funline __m256i 
+__funline __m256i _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) {
   return (__m256i)__builtin_ia32_vpermt2vard256_mask((__v8si)__I /* idx */, (__v8si)__A, (__v8si)__B, (__mmask8)-1); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) {
+__funline __m256i _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) {
   return (__m256i)__builtin_ia32_vpermt2vard256_mask((__v8si)__I /* idx */, (__v8si)__A, (__v8si)__B, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) {
+__funline __m256i _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) {
   return (__m256i)__builtin_ia32_vpermi2vard256_mask((__v8si)__A,
                                                      (__v8si)__I /* idx */,
@@ -4736,37 +3652,31 @@ extern __inline __m256i
                                                      (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) {
+__funline __m256i _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) {
   return (__m256i)__builtin_ia32_vpermt2vard256_maskz((__v8si)__I /* idx */, (__v8si)__A, (__v8si)__B, (__mmask8)__U); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) {
+__funline __m128d _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) {
   return (__m128d)__builtin_ia32_vpermt2varpd128_mask((__v2di)__I /* idx */, (__v2df)__A, (__v2df)__B, (__mmask8)-1); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) {
+__funline __m128d _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) {
   return (__m128d)__builtin_ia32_vpermt2varpd128_mask((__v2di)__I /* idx */, (__v2df)__A, (__v2df)__B, (__mmask8)__U); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) {
+__funline __m128d _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) {
   return (__m128d)__builtin_ia32_vpermi2varpd128_mask((__v2df)__A,
                                                       (__v2di)__I /* idx */,
@@ -4774,36 +3684,31 @@ _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) {
                                                       (__mmask8)__U); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) {
+__funline __m128d _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) {
   return (__m128d)__builtin_ia32_vpermt2varpd128_maskz((__v2di)__I /* idx */, (__v2df)__A, (__v2df)__B, (__mmask8)__U); }
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) {
+__funline __m128 _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) {
   return (__m128)__builtin_ia32_vpermt2varps128_mask((__v4si)__I /* idx */, (__v4sf)__A, (__v4sf)__B, (__mmask8)-1); }
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) {
+__funline __m128 _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) {
   return (__m128)__builtin_ia32_vpermt2varps128_mask((__v4si)__I /* idx */, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U); }
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) {
+__funline __m128 _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) {
   return (__m128)__builtin_ia32_vpermi2varps128_mask((__v4sf)__A,
                                                      (__v4si)__I /* idx */,
@@ -4811,1001 +3716,787 @@ _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) {
                                                      (__mmask8)__U); }
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) {
+__funline __m128 _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) {
   return (__m128)__builtin_ia32_vpermt2varps128_maskz((__v4si)__I /* idx */, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srav_epi64(__m128i __X, __m128i __Y) {
+__funline __m128i _mm_srav_epi64(__m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psravq128_mask((__v2di)__X, (__v2di)__Y, (__v2di)_mm_setzero_si128(), (__mmask8)-1); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+__funline __m128i _mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psravq128_mask((__v2di)__X, (__v2di)__Y, (__v2di)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
+__funline __m128i _mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psravq128_mask((__v2di)__X, (__v2di)__Y, (__v2di)_mm_setzero_si128(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+__funline __m256i _mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psllv8si_mask((__v8si)__X, (__v8si)__Y, (__v8si)__W, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) {
+__funline __m256i _mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psllv8si_mask((__v8si)__X, (__v8si)__Y, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+__funline __m128i _mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psllv4si_mask((__v4si)__X, (__v4si)__Y, (__v4si)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
+__funline __m128i _mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psllv4si_mask((__v4si)__X, (__v4si)__Y, (__v4si)_mm_setzero_si128(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+__funline __m256i _mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psllv4di_mask((__v4di)__X, (__v4di)__Y, (__v4di)__W, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) {
+__funline __m256i _mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psllv4di_mask((__v4di)__X, (__v4di)__Y, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+__funline __m128i _mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psllv2di_mask((__v2di)__X, (__v2di)__Y, (__v2di)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
+__funline __m128i _mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psllv2di_mask((__v2di)__X, (__v2di)__Y, (__v2di)_mm_setzero_si128(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+__funline __m256i _mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psrav8si_mask((__v8si)__X, (__v8si)__Y, (__v8si)__W, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y) {
+__funline __m256i _mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psrav8si_mask((__v8si)__X, (__v8si)__Y, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+__funline __m128i _mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psrav4si_mask((__v4si)__X, (__v4si)__Y, (__v4si)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
+__funline __m128i _mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psrav4si_mask((__v4si)__X, (__v4si)__Y, (__v4si)_mm_setzero_si128(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+__funline __m256i _mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psrlv8si_mask((__v8si)__X, (__v8si)__Y, (__v8si)__W, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) {
+__funline __m256i _mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psrlv8si_mask((__v8si)__X, (__v8si)__Y, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+__funline __m128i _mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psrlv4si_mask((__v4si)__X, (__v4si)__Y, (__v4si)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
+__funline __m128i _mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psrlv4si_mask((__v4si)__X, (__v4si)__Y, (__v4si)_mm_setzero_si128(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+__funline __m256i _mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psrlv4di_mask((__v4di)__X, (__v4di)__Y, (__v4di)__W, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) {
+__funline __m256i _mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psrlv4di_mask((__v4di)__X, (__v4di)__Y, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+__funline __m128i _mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psrlv2di_mask((__v2di)__X, (__v2di)__Y, (__v2di)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
+__funline __m128i _mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psrlv2di_mask((__v2di)__X, (__v2di)__Y, (__v2di)_mm_setzero_si128(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_rolv_epi32(__m256i __A, __m256i __B) {
+__funline __m256i _mm256_rolv_epi32(__m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_prolvd256_mask((__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_rolv_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_rolv_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_prolvd256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_rolv_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_rolv_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_prolvd256_mask((__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); }
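/*
 * Illustrative sketch (not part of the patch): the mask/maskz pairs being
 * rewritten above differ only in how inactive lanes are filled. The mask
 * form merges inactive lanes from the pass-through operand __W, while the
 * maskz form zeroes them. A minimal demonstration, assuming an AVX-512VL
 * target and a hypothetical caller-chosen mask value:
 */
#include <immintrin.h>

__m256i sllv_mask_demo(__m256i w, __m256i x, __m256i counts) {
  __mmask8 k = 0x0F;  /* only the low four 32-bit lanes are active */
  __m256i merged = _mm256_mask_sllv_epi32(w, k, x, counts); /* lanes 4-7 copied from w */
  __m256i zeroed = _mm256_maskz_sllv_epi32(k, x, counts);   /* lanes 4-7 forced to 0 */
  return _mm256_xor_si256(merged, zeroed);
}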
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_rolv_epi32(__m128i __A, __m128i __B) {
+__funline __m128i _mm_rolv_epi32(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_prolvd128_mask((__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)-1); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_rolv_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_rolv_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_prolvd128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_rolv_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_rolv_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_prolvd128_mask((__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_rorv_epi32(__m256i __A, __m256i __B) {
+__funline __m256i _mm256_rorv_epi32(__m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_prorvd256_mask((__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_rorv_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_rorv_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_prorvd256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_rorv_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_rorv_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_prorvd256_mask((__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_rorv_epi32(__m128i __A, __m128i __B) {
+__funline __m128i _mm_rorv_epi32(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_prorvd128_mask((__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)-1); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_rorv_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_rorv_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_prorvd128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_rorv_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_rorv_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_prorvd128_mask((__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_rolv_epi64(__m256i __A, __m256i __B) {
+__funline __m256i _mm256_rolv_epi64(__m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_prolvq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_rolv_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_rolv_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_prolvq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_rolv_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_rolv_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_prolvq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_rolv_epi64(__m128i __A, __m128i __B) {
+__funline __m128i _mm_rolv_epi64(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_prolvq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_rolv_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_rolv_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_prolvq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_rolv_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_rolv_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_prolvq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_rorv_epi64(__m256i __A, __m256i __B) {
+__funline __m256i _mm256_rorv_epi64(__m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_prorvq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_rorv_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_rorv_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_prorvq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_rorv_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_rorv_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_prorvq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_rorv_epi64(__m128i __A, __m128i __B) {
+__funline __m128i _mm_rorv_epi64(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_prorvq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_rorv_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_rorv_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_prorvq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_rorv_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_rorv_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_prorvq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_srav_epi64(__m256i __X, __m256i __Y) {
+__funline __m256i _mm256_srav_epi64(__m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psravq256_mask((__v4di)__X, (__v4di)__Y, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+__funline __m256i _mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psravq256_mask((__v4di)__X, (__v4di)__Y, (__v4di)__W, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_srav_epi64(__mmask8 __U, __m256i __X, __m256i __Y) {
+__funline __m256i _mm256_maskz_srav_epi64(__mmask8 __U, __m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psravq256_mask((__v4di)__X, (__v4di)__Y, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pandq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, __U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_and_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_and_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pandq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_pd(), __U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pandq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, __U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_and_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_and_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pandq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_pd(), __U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pandnq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, __U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_andnot_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_andnot_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pandnq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_pd(), __U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pandnq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, __U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_andnot_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_andnot_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pandnq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_pd(), __U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_porq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_or_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_or_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_porq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_or_epi64(__m256i __A, __m256i __B) {
+__funline __m256i _mm256_or_epi64(__m256i __A, __m256i __B) {
   return (__m256i)((__v4du)__A | (__v4du)__B); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_porq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_or_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_or_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_porq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_or_epi64(__m128i __A, __m128i __B) {
+__funline __m128i _mm_or_epi64(__m128i __A, __m128i __B) {
   return (__m128i)((__v2du)__A | (__v2du)__B); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pxorq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_xor_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_xor_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pxorq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); }
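/*
 * Illustrative sketch (not part of the patch): the rolv/rorv intrinsics
 * above rotate each lane by the count held in the corresponding lane of
 * the second operand, so a 32-bit lane holding 0x80000001 rotated left by
 * one yields 0x00000003. Assumes an AVX-512VL target.
 */
#include <immintrin.h>

__m256i rolv_demo(void) {
  __m256i a = _mm256_set1_epi32(0x80000001);
  __m256i n = _mm256_set1_epi32(1);
  return _mm256_rolv_epi32(a, n); /* every lane becomes 0x00000003 */
}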
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_xor_epi64(__m256i __A, __m256i __B) {
+__funline __m256i _mm256_xor_epi64(__m256i __A, __m256i __B) {
   return (__m256i)((__v4du)__A ^ (__v4du)__B); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pxorq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pxorq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_xor_epi64(__m128i __A, __m128i __B) {
+__funline __m128i _mm_xor_epi64(__m128i __A, __m128i __B) {
   return (__m128i)((__v2du)__A ^ (__v2du)__B); }
-extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+__funline __m256d _mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d)__builtin_ia32_maxpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U); }
-extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+__funline __m256d _mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d)__builtin_ia32_maxpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); }
-extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+__funline __m256 _mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256)__builtin_ia32_maxps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U); }
-extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+__funline __m256 _mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256)__builtin_ia32_maxps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); }
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+__funline __m128 _mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128)__builtin_ia32_divps_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U); }
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+__funline __m128 _mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128)__builtin_ia32_divps_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_divpd_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_divpd_mask((__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); }
-extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+__funline __m256d _mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d)__builtin_ia32_minpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U); }
-extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+__funline __m256d _mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d)__builtin_ia32_divpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U); }
-extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+__funline __m256d _mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d)__builtin_ia32_minpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); }
-extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+__funline __m256 _mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256)__builtin_ia32_minps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U); }
-extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+__funline __m256d _mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d)__builtin_ia32_divpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); }
-extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+__funline __m256 _mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256)__builtin_ia32_divps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U); }
-extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+__funline __m256 _mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256)__builtin_ia32_minps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); }
-extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+__funline __m256 _mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256)__builtin_ia32_divps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); }
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+__funline __m128 _mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128)__builtin_ia32_minps_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U); }
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+__funline __m128 _mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128)__builtin_ia32_mulps_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U); }
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+__funline __m128 _mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128)__builtin_ia32_minps_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); }
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+__funline __m128 _mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128)__builtin_ia32_mulps_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); }
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+__funline __m128 _mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128)__builtin_ia32_maxps_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U); }
-extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+__funline __m128 _mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) {
   return (__m128)__builtin_ia32_maxps_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_minpd_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_minpd_mask((__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_maxpd_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_maxpd_mask((__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_mulpd_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_mulpd_mask((__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); }
-extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+__funline __m256 _mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256)__builtin_ia32_mulps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U); }
-extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+__funline __m256 _mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256)__builtin_ia32_mulps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); }
-extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+__funline __m256d _mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d)__builtin_ia32_mulpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U); }
-extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+__funline __m256d _mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d)__builtin_ia32_mulpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_max_epi64(__mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_max_epi64(__mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pmaxsq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), __M); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_max_epi64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_max_epi64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pmaxsq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, __M); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_min_epi64(__m256i __A, __m256i __B) {
+__funline __m256i _mm256_min_epi64(__m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pminsq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_min_epi64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_min_epi64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pminsq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, __M); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_min_epi64(__mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_min_epi64(__mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pminsq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), __M); }
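/*
 * Illustrative sketch (not part of the patch): zero-masking is handy for
 * guarding lanes that must not take part in an operation, e.g. skipping
 * divides in lanes whose divisor is zero. Assumes an AVX-512VL target;
 * the mask is produced with _mm_cmp_pd_mask from the same extension.
 */
#include <immintrin.h>

__m128d safe_div_demo(__m128d num, __m128d den) {
  __mmask8 nonzero = _mm_cmp_pd_mask(den, _mm_setzero_pd(), _CMP_NEQ_OQ);
  return _mm_maskz_div_pd(nonzero, num, den); /* lanes with den == 0.0 become 0.0 */
}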
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_max_epu64(__mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_max_epu64(__mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pmaxuq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), __M); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_max_epi64(__m256i __A, __m256i __B) {
+__funline __m256i _mm256_max_epi64(__m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pmaxsq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_max_epu64(__m256i __A, __m256i __B) {
+__funline __m256i _mm256_max_epu64(__m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pmaxuq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_max_epu64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_max_epu64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pmaxuq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, __M); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_min_epu64(__m256i __A, __m256i __B) {
+__funline __m256i _mm256_min_epu64(__m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pminuq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_min_epu64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_min_epu64(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pminuq256_mask((__v4di)__A, (__v4di)__B, (__v4di)__W, __M); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_min_epu64(__mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_min_epu64(__mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pminuq256_mask((__v4di)__A, (__v4di)__B, (__v4di)_mm256_setzero_si256(), __M); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pmaxsd256_mask((__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), __M); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pmaxsd256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, __M); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pminsd256_mask((__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), __M); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pminsd256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, __M); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pmaxud256_mask((__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), __M); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pmaxud256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, __M); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pminud256_mask((__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), __M); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+__funline __m256i _mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
   return (__m256i)__builtin_ia32_pminud256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_max_epi64(__mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_max_epi64(__mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmaxsq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_max_epi64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_max_epi64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmaxsq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_epi64(__m128i __A, __m128i __B) {
+__funline __m128i _mm_min_epi64(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pminsq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_min_epi64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_min_epi64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pminsq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_min_epi64(__mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_min_epi64(__mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pminsq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_max_epu64(__mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_max_epu64(__mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmaxuq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_epi64(__m128i __A, __m128i __B) {
+__funline __m128i _mm_max_epi64(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmaxsq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_max_epu64(__m128i __A, __m128i __B) {
+__funline __m128i _mm_max_epu64(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmaxuq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_max_epu64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_max_epu64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmaxuq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_epu64(__m128i __A, __m128i __B) {
+__funline __m128i _mm_min_epu64(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pminuq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_min_epu64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_min_epu64(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pminuq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_min_epu64(__mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_min_epu64(__mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pminuq128_mask((__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmaxsd128_mask((__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmaxsd128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pminsd128_mask((__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pminsd128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmaxud128_mask((__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmaxud128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pminud128_mask((__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), __M); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+__funline __m128i _mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pminud128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, __M); }
@@ -5816,194 +4507,144 @@ extern __inline __m128i
 #define __DISABLE_AVX512VLCD__
 #endif
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_broadcastmb_epi64(__mmask8 __A) {
+__funline __m128i _mm_broadcastmb_epi64(__mmask8 __A) {
   return (__m128i)__builtin_ia32_broadcastmb128(__A); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_broadcastmb_epi64(__mmask8 __A) {
+__funline __m256i _mm256_broadcastmb_epi64(__mmask8 __A) {
   return (__m256i)__builtin_ia32_broadcastmb256(__A); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_broadcastmw_epi32(__mmask16 __A) {
+__funline __m128i _mm_broadcastmw_epi32(__mmask16 __A) {
   return (__m128i)__builtin_ia32_broadcastmw128(__A); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_broadcastmw_epi32(__mmask16 __A) {
+__funline __m256i _mm256_broadcastmw_epi32(__mmask16 __A) {
   return (__m256i)__builtin_ia32_broadcastmw256(__A); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_lzcnt_epi32(__m256i __A) {
+__funline __m256i _mm256_lzcnt_epi32(__m256i __A) {
   return (__m256i)__builtin_ia32_vplzcntd_256_mask((__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_lzcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+__funline __m256i _mm256_mask_lzcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
   return (__m256i)__builtin_ia32_vplzcntd_256_mask((__v8si)__A, (__v8si)__W, (__mmask8)__U); }
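/*
 * Illustrative sketch (not part of the patch): _mm256_lzcnt_epi32 counts
 * leading zero bits per 32-bit lane, e.g. a lane holding 16 (bit 4 set)
 * has 27 leading zeros and a lane holding 0 reports 32. Assumes an
 * AVX-512VL+CD target.
 */
#include <immintrin.h>

__m256i lzcnt_demo(void) {
  __m256i v = _mm256_setr_epi32(0, 1, 16, 255, 1 << 30, -1, 2, 3);
  return _mm256_lzcnt_epi32(v); /* {32, 31, 27, 24, 1, 0, 30, 30} */
}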
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_lzcnt_epi32(__mmask8 __U, __m256i __A) {
+__funline __m256i _mm256_maskz_lzcnt_epi32(__mmask8 __U, __m256i __A) {
   return (__m256i)__builtin_ia32_vplzcntd_256_mask((__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_lzcnt_epi64(__m256i __A) {
+__funline __m256i _mm256_lzcnt_epi64(__m256i __A) {
   return (__m256i)__builtin_ia32_vplzcntq_256_mask((__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_lzcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+__funline __m256i _mm256_mask_lzcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
   return (__m256i)__builtin_ia32_vplzcntq_256_mask((__v4di)__A, (__v4di)__W, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_lzcnt_epi64(__mmask8 __U, __m256i __A) {
+__funline __m256i _mm256_maskz_lzcnt_epi64(__mmask8 __U, __m256i __A) {
   return (__m256i)__builtin_ia32_vplzcntq_256_mask((__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_conflict_epi64(__m256i __A) {
+__funline __m256i _mm256_conflict_epi64(__m256i __A) {
   return (__m256i)__builtin_ia32_vpconflictdi_256_mask((__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_conflict_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+__funline __m256i _mm256_mask_conflict_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
   return (__m256i)__builtin_ia32_vpconflictdi_256_mask((__v4di)__A, (__v4di)__W, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_conflict_epi64(__mmask8 __U, __m256i __A) {
+__funline __m256i _mm256_maskz_conflict_epi64(__mmask8 __U, __m256i __A) {
   return (__m256i)__builtin_ia32_vpconflictdi_256_mask((__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_conflict_epi32(__m256i __A) {
+__funline __m256i _mm256_conflict_epi32(__m256i __A) {
   return (__m256i)__builtin_ia32_vpconflictsi_256_mask((__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_conflict_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+__funline __m256i _mm256_mask_conflict_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
   return (__m256i)__builtin_ia32_vpconflictsi_256_mask((__v8si)__A, (__v8si)__W, (__mmask8)__U); }
-extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_conflict_epi32(__mmask8 __U, __m256i __A) {
+__funline __m256i _mm256_maskz_conflict_epi32(__mmask8 __U, __m256i __A) {
   return (__m256i)__builtin_ia32_vpconflictsi_256_mask((__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_lzcnt_epi32(__m128i __A) {
+__funline __m128i _mm_lzcnt_epi32(__m128i __A) {
   return (__m128i)__builtin_ia32_vplzcntd_128_mask((__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_lzcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+__funline __m128i _mm_mask_lzcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
   return (__m128i)__builtin_ia32_vplzcntd_128_mask((__v4si)__A, (__v4si)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_lzcnt_epi32(__mmask8 __U, __m128i __A) {
+__funline __m128i _mm_maskz_lzcnt_epi32(__mmask8 __U, __m128i __A) {
   return (__m128i)__builtin_ia32_vplzcntd_128_mask((__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_lzcnt_epi64(__m128i __A) {
+__funline __m128i _mm_lzcnt_epi64(__m128i __A) {
   return (__m128i)__builtin_ia32_vplzcntq_128_mask((__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_lzcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+__funline __m128i _mm_mask_lzcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
   return (__m128i)__builtin_ia32_vplzcntq_128_mask((__v2di)__A, (__v2di)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_lzcnt_epi64(__mmask8 __U, __m128i __A) {
+__funline __m128i _mm_maskz_lzcnt_epi64(__mmask8 __U, __m128i __A) {
   return (__m128i)__builtin_ia32_vplzcntq_128_mask((__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_conflict_epi64(__m128i __A) {
+__funline __m128i _mm_conflict_epi64(__m128i __A) {
   return (__m128i)__builtin_ia32_vpconflictdi_128_mask((__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)-1); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_conflict_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+__funline __m128i _mm_mask_conflict_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
   return (__m128i)__builtin_ia32_vpconflictdi_128_mask((__v2di)__A, (__v2di)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_conflict_epi64(__mmask8 __U, __m128i __A) {
+__funline __m128i _mm_maskz_conflict_epi64(__mmask8 __U, __m128i __A) {
   return (__m128i)__builtin_ia32_vpconflictdi_128_mask((__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_conflict_epi32(__m128i __A) {
+__funline __m128i _mm_conflict_epi32(__m128i __A) {
   return (__m128i)__builtin_ia32_vpconflictsi_128_mask((__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)-1); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_conflict_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+__funline __m128i _mm_mask_conflict_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
   return (__m128i)__builtin_ia32_vpconflictsi_128_mask((__v4si)__A, (__v4si)__W, (__mmask8)__U); }
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_conflict_epi32(__mmask8 __U, __m128i __A) {
+__funline __m128i _mm_maskz_conflict_epi32(__mmask8 __U, __m128i __A) {
   return (__m128i)__builtin_ia32_vpconflictsi_128_mask((__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); }
@@ -6012,2805 +4653,2228 @@ extern __inline __m128i
 #pragma GCC pop_options
 #endif
-extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+__funline __m256d _mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d)__builtin_ia32_unpcklpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U); }
-extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+__funline __m256d _mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d)__builtin_ia32_unpcklpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_unpcklpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_unpcklpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); }
-extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+__funline __m256 _mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   return (__m256)__builtin_ia32_unpcklps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U); }
-extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+__funline __m256d _mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d)__builtin_ia32_unpckhpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)__W, (__mmask8)__U); }
-extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+__funline __m256d _mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) {
   return (__m256d)__builtin_ia32_unpckhpd256_mask((__v4df)__A, (__v4df)__B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_unpckhpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)__W, (__mmask8)__U); }
-extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+__funline __m128d _mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_unpckhpd128_mask((__v2df)__A, (__v2df)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); }
-extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+__funline __m256 _mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A,
+                                        __m256
__B) { return (__m256)__builtin_ia32_unpckhps256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_unpckhps256_mask( (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_unpckhps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_unpckhps128_mask( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtph_ps(__m128 __W, __mmask8 __U, __m128i __A) { +__funline __m128 _mm_mask_cvtph_ps(__m128 __W, __mmask8 __U, __m128i __A) { return (__m128)__builtin_ia32_vcvtph2ps_mask((__v8hi)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtph_ps(__mmask8 __U, __m128i __A) { +__funline __m128 _mm_maskz_cvtph_ps(__mmask8 __U, __m128i __A) { return (__m128)__builtin_ia32_vcvtph2ps_mask( (__v8hi)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) { +__funline __m256 _mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_unpcklps256_mask( (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtph_ps(__m256 __W, __mmask8 __U, __m128i __A) { +__funline __m256 _mm256_mask_cvtph_ps(__m256 __W, __mmask8 __U, __m128i __A) { return (__m256)__builtin_ia32_vcvtph2ps256_mask((__v8hi)__A, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtph_ps(__mmask8 __U, __m128i __A) { +__funline __m256 _mm256_maskz_cvtph_ps(__mmask8 __U, __m128i __A) { return (__m256)__builtin_ia32_vcvtph2ps256_mask( (__v8hi)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) { return (__m128)__builtin_ia32_unpcklps128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) { +__funline __m128 _mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_unpcklps128_mask( (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ps(), 
(__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) { return (__m256i)__builtin_ia32_psrad256_mask((__v8si)__A, (__v4si)__B, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psrad256_mask( (__v8si)__A, (__v4si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psrad128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psrad128_mask( (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sra_epi64(__m256i __A, __m128i __B) { +__funline __m256i _mm256_sra_epi64(__m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psraq256_mask( (__v4di)__A, (__v2di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) { return (__m256i)__builtin_ia32_psraq256_mask((__v4di)__A, (__v2di)__B, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psraq256_mask( (__v4di)__A, (__v2di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sra_epi64(__m128i __A, __m128i __B) { +__funline __m128i _mm_sra_epi64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psraq128_mask( (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psraq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psraq128_mask( (__v2di)__A, (__v2di)__B, 
(__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pslld128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pslld128_mask( (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_psllq128_mask((__v2di)__A, (__v2di)__B, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psllq128_mask( (__v2di)__A, (__v2di)__B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) { return (__m256i)__builtin_ia32_pslld256_mask((__v8si)__A, (__v4si)__B, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_pslld256_mask( (__v8si)__A, (__v4si)__B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) { return (__m256i)__builtin_ia32_psllq256_mask((__v4di)__A, (__v2di)__B, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) { +__funline __m256i _mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psllq256_mask( (__v4di)__A, (__v2di)__B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y) { +__funline __m256 _mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, + __m256 __Y) { return (__m256)__builtin_ia32_permvarsf256_mask((__v8sf)__Y, (__v8si)__X, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y) { +__funline __m256 
_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, + __m256 __Y) { return (__m256)__builtin_ia32_permvarsf256_mask( (__v8sf)__Y, (__v8si)__X, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutexvar_pd(__m256i __X, __m256d __Y) { +__funline __m256d _mm256_permutexvar_pd(__m256i __X, __m256d __Y) { return (__m256d)__builtin_ia32_permvardf256_mask( (__v4df)__Y, (__v4di)__X, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_permutexvar_pd(__m256d __W, __mmask8 __U, __m256i __X, - __m256d __Y) { +__funline __m256d _mm256_mask_permutexvar_pd(__m256d __W, __mmask8 __U, + __m256i __X, __m256d __Y) { return (__m256d)__builtin_ia32_permvardf256_mask((__v4df)__Y, (__v4di)__X, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permutexvar_pd(__mmask8 __U, __m256i __X, __m256d __Y) { +__funline __m256d _mm256_maskz_permutexvar_pd(__mmask8 __U, __m256i __X, + __m256d __Y) { return (__m256d)__builtin_ia32_permvardf256_mask( (__v4df)__Y, (__v4di)__X, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C) { +__funline __m256d _mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, + __m256d __A, __m256i __C) { return (__m256d)__builtin_ia32_vpermilvarpd256_mask( (__v4df)__A, (__v4di)__C, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C) { +__funline __m256d _mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, + __m256i __C) { return (__m256d)__builtin_ia32_vpermilvarpd256_mask( (__v4df)__A, (__v4di)__C, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C) { +__funline __m256 _mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, + __m256i __C) { return (__m256)__builtin_ia32_vpermilvarps256_mask( (__v8sf)__A, (__v8si)__C, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C) { +__funline __m256 _mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, + __m256i __C) { return (__m256)__builtin_ia32_vpermilvarps256_mask( (__v8sf)__A, (__v8si)__C, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C) { +__funline __m128d _mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, + __m128i __C) { return (__m128d)__builtin_ia32_vpermilvarpd_mask((__v2df)__A, (__v2di)__C, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C) { +__funline __m128d _mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, + __m128i __C) { return (__m128d)__builtin_ia32_vpermilvarpd_mask( (__v2df)__A, (__v2di)__C, (__v2df)_mm_setzero_pd(), 
(__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C) { +__funline __m128 _mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, + __m128i __C) { return (__m128)__builtin_ia32_vpermilvarps_mask((__v4sf)__A, (__v4si)__C, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C) { +__funline __m128 _mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C) { return (__m128)__builtin_ia32_vpermilvarps_mask( (__v4sf)__A, (__v4si)__C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmulld256_mask( (__v8si)__A, (__v8si)__B, (__v8si)_mm256_setzero_si256(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permutexvar_epi64(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __m256i _mm256_maskz_permutexvar_epi64(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__m256i)__builtin_ia32_permvardi256_mask( (__v4di)__Y, (__v4di)__X, (__v4di)_mm256_setzero_si256(), __M); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { +__funline __m256i _mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_pmulld256_mask((__v8si)__A, (__v8si)__B, (__v8si)__W, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pmulld128_mask( (__v4si)__A, (__v4si)__B, (__v4si)_mm_setzero_si128(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { +__funline __m128i _mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_pmulld128_mask((__v4si)__A, (__v4si)__B, (__v4si)__W, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { +__funline __m256i _mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) { return (__m256i)__builtin_ia32_pmuldq256_mask((__v8si)__X, (__v8si)__Y, (__v4di)__W, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __m256i _mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_pmuldq256_mask( (__v8si)__X, (__v8si)__Y, (__v4di)_mm256_setzero_si256(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { +__funline __m128i _mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, + 
__m128i __Y) { return (__m128i)__builtin_ia32_pmuldq128_mask((__v4si)__X, (__v4si)__Y, (__v2di)__W, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __m128i _mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_pmuldq128_mask( (__v4si)__X, (__v4si)__Y, (__v2di)_mm_setzero_si128(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutexvar_epi64(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_permutexvar_epi64(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_permvardi256_mask( (__v4di)__Y, (__v4di)__X, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_permutexvar_epi64(__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) { +__funline __m256i _mm256_mask_permutexvar_epi64(__m256i __W, __mmask8 __M, + __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_permvardi256_mask((__v4di)__Y, (__v4di)__X, (__v4di)__W, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { +__funline __m256i _mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) { return (__m256i)__builtin_ia32_pmuludq256_mask((__v8si)__X, (__v8si)__Y, (__v4di)__W, __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __m256i _mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__m256i)__builtin_ia32_permvarsi256_mask( (__v8si)__Y, (__v8si)__X, (__v8si)_mm256_setzero_si256(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __m256i _mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_pmuludq256_mask( (__v8si)__X, (__v8si)__Y, (__v4di)_mm256_setzero_si256(), __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { +__funline __m128i _mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, + __m128i __Y) { return (__m128i)__builtin_ia32_pmuludq128_mask((__v4si)__X, (__v4si)__Y, (__v2di)__W, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __m128i _mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_pmuludq128_mask( (__v4si)__X, (__v4si)__Y, (__v2di)_mm_setzero_si128(), __M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutexvar_epi32(__m256i __X, __m256i __Y) { +__funline __m256i _mm256_permutexvar_epi32(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_permvarsi256_mask( (__v8si)__Y, (__v8si)__X, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) { +__funline __m256i 
_mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, + __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_permvarsi256_mask((__v8si)__Y, (__v8si)__X, (__v8si)__W, __M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpneq_epu32_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmpneq_epu32_mask(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 4, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpneq_epu32_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmpneq_epu32_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 4, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmplt_epu32_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmplt_epu32_mask(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 1, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmplt_epu32_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmplt_epu32_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 1, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpge_epu32_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmpge_epu32_mask(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 5, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpge_epu32_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmpge_epu32_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 5, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmple_epu32_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmple_epu32_mask(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 2, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmple_epu32_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmple_epu32_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, 2, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpneq_epu64_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmpneq_epu64_mask(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 4, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpneq_epu64_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmpneq_epu64_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 4, (__mmask8)-1); } -extern __inline __mmask8 - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmplt_epu64_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmplt_epu64_mask(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 1, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmplt_epu64_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmplt_epu64_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 1, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpge_epu64_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmpge_epu64_mask(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 5, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpge_epu64_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmpge_epu64_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 5, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmple_epu64_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmple_epu64_mask(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 2, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmple_epu64_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmple_epu64_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, 2, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpneq_epi32_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmpneq_epi32_mask(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 4, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpneq_epi32_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmpneq_epi32_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 4, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmplt_epi32_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmplt_epi32_mask(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 1, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmplt_epi32_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmplt_epi32_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 1, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpge_epi32_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmpge_epi32_mask(__mmask8 __M, __m256i __X, + 
__m256i __Y) { return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 5, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpge_epi32_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmpge_epi32_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 5, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmple_epi32_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmple_epi32_mask(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 2, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmple_epi32_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmple_epi32_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, 2, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpneq_epi64_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmpneq_epi64_mask(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 4, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpneq_epi64_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmpneq_epi64_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 4, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmplt_epi64_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmplt_epi64_mask(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 1, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmplt_epi64_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmplt_epi64_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 1, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmpge_epi64_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmpge_epi64_mask(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 5, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmpge_epi64_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmpge_epi64_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 5, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmple_epi64_mask(__mmask8 __M, __m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_mask_cmple_epi64_mask(__mmask8 __M, __m256i __X, + __m256i __Y) { return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 2, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm256_cmple_epi64_mask(__m256i __X, __m256i __Y) { +__funline __mmask8 _mm256_cmple_epi64_mask(__m256i __X, __m256i __Y) { return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, 2, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpneq_epu32_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmpneq_epu32_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 4, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpneq_epu32_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmpneq_epu32_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 4, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmplt_epu32_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmplt_epu32_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 1, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmplt_epu32_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmplt_epu32_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 1, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpge_epu32_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmpge_epu32_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 5, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpge_epu32_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmpge_epu32_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 5, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmple_epu32_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmple_epu32_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 2, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmple_epu32_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmple_epu32_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, 2, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpneq_epu64_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmpneq_epu64_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 4, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpneq_epu64_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmpneq_epu64_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 4, (__mmask8)-1); } -extern __inline __mmask8 - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmplt_epu64_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmplt_epu64_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 1, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmplt_epu64_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmplt_epu64_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 1, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpge_epu64_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmpge_epu64_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 5, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpge_epu64_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmpge_epu64_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 5, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmple_epu64_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmple_epu64_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 2, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmple_epu64_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmple_epu64_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, 2, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpneq_epi32_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmpneq_epi32_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 4, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpneq_epi32_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmpneq_epi32_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 4, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmplt_epi32_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmplt_epi32_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 1, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmplt_epi32_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmplt_epi32_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 1, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpge_epi32_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmpge_epi32_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return 
(__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 5, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpge_epi32_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmpge_epi32_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 5, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmple_epi32_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmple_epi32_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 2, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmple_epi32_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmple_epi32_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, 2, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpneq_epi64_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmpneq_epi64_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 4, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpneq_epi64_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmpneq_epi64_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 4, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmplt_epi64_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmplt_epi64_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 1, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmplt_epi64_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmplt_epi64_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 1, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmpge_epi64_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmpge_epi64_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 5, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpge_epi64_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmpge_epi64_mask(__m128i __X, __m128i __Y) { return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 5, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmple_epi64_mask(__mmask8 __M, __m128i __X, __m128i __Y) { +__funline __mmask8 _mm_mask_cmple_epi64_mask(__mmask8 __M, __m128i __X, + __m128i __Y) { return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 2, (__mmask8)__M); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmple_epi64_mask(__m128i __X, __m128i __Y) { +__funline __mmask8 _mm_cmple_epi64_mask(__m128i __X, 
__m128i __Y) { return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, 2, (__mmask8)-1); } #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutex_epi64(__m256i __X, const int __I) { +__funline __m256i _mm256_permutex_epi64(__m256i __X, const int __I) { return (__m256i)__builtin_ia32_permdi256_mask( (__v4di)__X, __I, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_permutex_epi64(__m256i __W, __mmask8 __M, __m256i __X, - const int __I) { +__funline __m256i _mm256_mask_permutex_epi64(__m256i __W, __mmask8 __M, + __m256i __X, const int __I) { return (__m256i)__builtin_ia32_permdi256_mask((__v4di)__X, __I, (__v4di)__W, (__mmask8)__M); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permutex_epi64(__mmask8 __M, __m256i __X, const int __I) { +__funline __m256i _mm256_maskz_permutex_epi64(__mmask8 __M, __m256i __X, + const int __I) { return (__m256i)__builtin_ia32_permdi256_mask( (__v4di)__X, __I, (__v4di)_mm256_setzero_si256(), (__mmask8)__M); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_shuffle_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B, - const int __imm) { +__funline __m256d _mm256_mask_shuffle_pd(__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B, const int __imm) { return (__m256d)__builtin_ia32_shufpd256_mask((__v4df)__A, (__v4df)__B, __imm, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_shuffle_pd(__mmask8 __U, __m256d __A, __m256d __B, - const int __imm) { +__funline __m256d _mm256_maskz_shuffle_pd(__mmask8 __U, __m256d __A, __m256d __B, + const int __imm) { return (__m256d)__builtin_ia32_shufpd256_mask((__v4df)__A, (__v4df)__B, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shuffle_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, - const int __imm) { +__funline __m128d _mm_mask_shuffle_pd(__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __imm) { return (__m128d)__builtin_ia32_shufpd128_mask((__v2df)__A, (__v2df)__B, __imm, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_shuffle_pd(__mmask8 __U, __m128d __A, __m128d __B, const int __imm) { +__funline __m128d _mm_maskz_shuffle_pd(__mmask8 __U, __m128d __A, __m128d __B, + const int __imm) { return (__m128d)__builtin_ia32_shufpd128_mask( (__v2df)__A, (__v2df)__B, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_shuffle_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B, - const int __imm) { +__funline __m256 _mm256_mask_shuffle_ps(__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B, const int __imm) { return (__m256)__builtin_ia32_shufps256_mask((__v8sf)__A, (__v8sf)__B, __imm, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_shuffle_ps(__mmask8 __U, __m256 __A, __m256 __B, const int __imm) { +__funline __m256 _mm256_maskz_shuffle_ps(__mmask8 __U, __m256 __A, __m256 __B, + const int __imm) { return 
(__m256)__builtin_ia32_shufps256_mask((__v8sf)__A, (__v8sf)__B, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shuffle_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, - const int __imm) { +__funline __m128 _mm_mask_shuffle_ps(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __imm) { return (__m128)__builtin_ia32_shufps128_mask((__v4sf)__A, (__v4sf)__B, __imm, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_shuffle_ps(__mmask8 __U, __m128 __A, __m128 __B, const int __imm) { +__funline __m128 _mm_maskz_shuffle_ps(__mmask8 __U, __m128 __A, __m128 __B, + const int __imm) { return (__m128)__builtin_ia32_shufps128_mask( (__v4sf)__A, (__v4sf)__B, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_inserti32x4(__m256i __A, __m128i __B, const int __imm) { +__funline __m256i _mm256_inserti32x4(__m256i __A, __m128i __B, const int __imm) { return (__m256i)__builtin_ia32_inserti32x4_256_mask( (__v8si)__A, (__v4si)__B, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_inserti32x4(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B, - const int __imm) { +__funline __m256i _mm256_mask_inserti32x4(__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B, const int __imm) { return (__m256i)__builtin_ia32_inserti32x4_256_mask( (__v8si)__A, (__v4si)__B, __imm, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_inserti32x4(__mmask8 __U, __m256i __A, __m128i __B, - const int __imm) { +__funline __m256i _mm256_maskz_inserti32x4(__mmask8 __U, __m256i __A, __m128i __B, + const int __imm) { return (__m256i)__builtin_ia32_inserti32x4_256_mask( (__v8si)__A, (__v4si)__B, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_insertf32x4(__m256 __A, __m128 __B, const int __imm) { +__funline __m256 _mm256_insertf32x4(__m256 __A, __m128 __B, const int __imm) { return (__m256)__builtin_ia32_insertf32x4_256_mask( (__v8sf)__A, (__v4sf)__B, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_insertf32x4(__m256 __W, __mmask8 __U, __m256 __A, __m128 __B, - const int __imm) { +__funline __m256 _mm256_mask_insertf32x4(__m256 __W, __mmask8 __U, __m256 __A, + __m128 __B, const int __imm) { return (__m256)__builtin_ia32_insertf32x4_256_mask( (__v8sf)__A, (__v4sf)__B, __imm, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_insertf32x4(__mmask8 __U, __m256 __A, __m128 __B, - const int __imm) { +__funline __m256 _mm256_maskz_insertf32x4(__mmask8 __U, __m256 __A, __m128 __B, + const int __imm) { return (__m256)__builtin_ia32_insertf32x4_256_mask( (__v8sf)__A, (__v4sf)__B, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_extracti32x4_epi32(__m256i __A, const int __imm) { +__funline __m128i _mm256_extracti32x4_epi32(__m256i 
__A, const int __imm) { return (__m128i)__builtin_ia32_extracti32x4_256_mask( (__v8si)__A, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __A, - const int __imm) { +__funline __m128i _mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, + __m256i __A, const int __imm) { return (__m128i)__builtin_ia32_extracti32x4_256_mask( (__v8si)__A, __imm, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A, const int __imm) { +__funline __m128i _mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A, + const int __imm) { return (__m128i)__builtin_ia32_extracti32x4_256_mask( (__v8si)__A, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_extractf32x4_ps(__m256 __A, const int __imm) { +__funline __m128 _mm256_extractf32x4_ps(__m256 __A, const int __imm) { return (__m128)__builtin_ia32_extractf32x4_256_mask( (__v8sf)__A, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A, - const int __imm) { +__funline __m128 _mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A, + const int __imm) { return (__m128)__builtin_ia32_extractf32x4_256_mask( (__v8sf)__A, __imm, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A, const int __imm) { +__funline __m128 _mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A, + const int __imm) { return (__m128)__builtin_ia32_extractf32x4_256_mask( (__v8sf)__A, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shuffle_i64x2(__m256i __A, __m256i __B, const int __imm) { +__funline __m256i _mm256_shuffle_i64x2(__m256i __A, __m256i __B, + const int __imm) { return (__m256i)__builtin_ia32_shuf_i64x2_256_mask( (__v4di)__A, (__v4di)__B, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B, const int __imm) { +__funline __m256i _mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, + __m256i __A, __m256i __B, + const int __imm) { return (__m256i)__builtin_ia32_shuf_i64x2_256_mask( (__v4di)__A, (__v4di)__B, __imm, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B, - const int __imm) { +__funline __m256i _mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, + __m256i __B, const int __imm) { return (__m256i)__builtin_ia32_shuf_i64x2_256_mask( (__v4di)__A, (__v4di)__B, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shuffle_i32x4(__m256i __A, __m256i __B, const int __imm) { +__funline __m256i _mm256_shuffle_i32x4(__m256i __A, __m256i __B, + const int __imm) { return 
(__m256i)__builtin_ia32_shuf_i32x4_256_mask( (__v8si)__A, (__v8si)__B, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B, const int __imm) { +__funline __m256i _mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, + __m256i __A, __m256i __B, + const int __imm) { return (__m256i)__builtin_ia32_shuf_i32x4_256_mask( (__v8si)__A, (__v8si)__B, __imm, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B, - const int __imm) { +__funline __m256i _mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, + __m256i __B, const int __imm) { return (__m256i)__builtin_ia32_shuf_i32x4_256_mask( (__v8si)__A, (__v8si)__B, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shuffle_f64x2(__m256d __A, __m256d __B, const int __imm) { +__funline __m256d _mm256_shuffle_f64x2(__m256d __A, __m256d __B, + const int __imm) { return (__m256d)__builtin_ia32_shuf_f64x2_256_mask( (__v4df)__A, (__v4df)__B, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B, const int __imm) { +__funline __m256d _mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, + __m256d __A, __m256d __B, + const int __imm) { return (__m256d)__builtin_ia32_shuf_f64x2_256_mask( (__v4df)__A, (__v4df)__B, __imm, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B, - const int __imm) { +__funline __m256d _mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, + __m256d __B, const int __imm) { return (__m256d)__builtin_ia32_shuf_f64x2_256_mask( (__v4df)__A, (__v4df)__B, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shuffle_f32x4(__m256 __A, __m256 __B, const int __imm) { +__funline __m256 _mm256_shuffle_f32x4(__m256 __A, __m256 __B, const int __imm) { return (__m256)__builtin_ia32_shuf_f32x4_256_mask( (__v8sf)__A, (__v8sf)__B, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B, - const int __imm) { +__funline __m256 _mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B, const int __imm) { return (__m256)__builtin_ia32_shuf_f32x4_256_mask( (__v8sf)__A, (__v8sf)__B, __imm, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B, - const int __imm) { +__funline __m256 _mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B, + const int __imm) { return (__m256)__builtin_ia32_shuf_f32x4_256_mask( (__v8sf)__A, (__v8sf)__B, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_fixupimm_pd(__m256d 
__A, __m256d __B, __m256i __C, const int __imm) { +__funline __m256d _mm256_fixupimm_pd(__m256d __A, __m256d __B, __m256i __C, + const int __imm) { return (__m256d)__builtin_ia32_fixupimmpd256_mask( (__v4df)__A, (__v4df)__B, (__v4di)__C, __imm, (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_fixupimm_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256i __C, - const int __imm) { +__funline __m256d _mm256_mask_fixupimm_pd(__m256d __A, __mmask8 __U, __m256d __B, + __m256i __C, const int __imm) { return (__m256d)__builtin_ia32_fixupimmpd256_mask( (__v4df)__A, (__v4df)__B, (__v4di)__C, __imm, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_fixupimm_pd(__mmask8 __U, __m256d __A, __m256d __B, - __m256i __C, const int __imm) { +__funline __m256d _mm256_maskz_fixupimm_pd(__mmask8 __U, __m256d __A, __m256d __B, + __m256i __C, const int __imm) { return (__m256d)__builtin_ia32_fixupimmpd256_maskz( (__v4df)__A, (__v4df)__B, (__v4di)__C, __imm, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_fixupimm_ps(__m256 __A, __m256 __B, __m256i __C, const int __imm) { +__funline __m256 _mm256_fixupimm_ps(__m256 __A, __m256 __B, __m256i __C, + const int __imm) { return (__m256)__builtin_ia32_fixupimmps256_mask( (__v8sf)__A, (__v8sf)__B, (__v8si)__C, __imm, (__mmask8)-1); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_fixupimm_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256i __C, - const int __imm) { +__funline __m256 _mm256_mask_fixupimm_ps(__m256 __A, __mmask8 __U, __m256 __B, + __m256i __C, const int __imm) { return (__m256)__builtin_ia32_fixupimmps256_mask( (__v8sf)__A, (__v8sf)__B, (__v8si)__C, __imm, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_fixupimm_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256i __C, - const int __imm) { +__funline __m256 _mm256_maskz_fixupimm_ps(__mmask8 __U, __m256 __A, __m256 __B, + __m256i __C, const int __imm) { return (__m256)__builtin_ia32_fixupimmps256_maskz( (__v8sf)__A, (__v8sf)__B, (__v8si)__C, __imm, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fixupimm_pd(__m128d __A, __m128d __B, __m128i __C, const int __imm) { +__funline __m128d _mm_fixupimm_pd(__m128d __A, __m128d __B, __m128i __C, + const int __imm) { return (__m128d)__builtin_ia32_fixupimmpd128_mask( (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)-1); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fixupimm_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128i __C, - const int __imm) { +__funline __m128d _mm_mask_fixupimm_pd(__m128d __A, __mmask8 __U, __m128d __B, + __m128i __C, const int __imm) { return (__m128d)__builtin_ia32_fixupimmpd128_mask( (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fixupimm_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128i __C, - const int __imm) { +__funline __m128d _mm_maskz_fixupimm_pd(__mmask8 __U, __m128d __A, __m128d __B, + __m128i __C, const int __imm) { return (__m128d)__builtin_ia32_fixupimmpd128_maskz( (__v2df)__A, (__v2df)__B, (__v2di)__C, __imm, 
(__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fixupimm_ps(__m128 __A, __m128 __B, __m128i __C, const int __imm) { +__funline __m128 _mm_fixupimm_ps(__m128 __A, __m128 __B, __m128i __C, + const int __imm) { return (__m128)__builtin_ia32_fixupimmps128_mask( (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)-1); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_fixupimm_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128i __C, - const int __imm) { +__funline __m128 _mm_mask_fixupimm_ps(__m128 __A, __mmask8 __U, __m128 __B, + __m128i __C, const int __imm) { return (__m128)__builtin_ia32_fixupimmps128_mask( (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_fixupimm_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128i __C, - const int __imm) { +__funline __m128 _mm_maskz_fixupimm_ps(__mmask8 __U, __m128 __A, __m128 __B, + __m128i __C, const int __imm) { return (__m128)__builtin_ia32_fixupimmps128_maskz( (__v4sf)__A, (__v4sf)__B, (__v4si)__C, __imm, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, - const int __imm) { +__funline __m256i _mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, + const int __imm) { return (__m256i)__builtin_ia32_psrldi256_mask((__v8si)__A, __imm, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, const int __imm) { +__funline __m256i _mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, + const int __imm) { return (__m256i)__builtin_ia32_psrldi256_mask( (__v8si)__A, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, const int __imm) { +__funline __m128i _mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) { return (__m128i)__builtin_ia32_psrldi128_mask((__v4si)__A, __imm, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, const int __imm) { +__funline __m128i _mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, + const int __imm) { return (__m128i)__builtin_ia32_psrldi128_mask( (__v4si)__A, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, - const int __imm) { +__funline __m256i _mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, + const int __imm) { return (__m256i)__builtin_ia32_psrlqi256_mask((__v4di)__A, __imm, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, const int __imm) { +__funline __m256i _mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, + const int __imm) { return (__m256i)__builtin_ia32_psrlqi256_mask( (__v4di)__A, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - 
__artificial__)) -_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, const int __imm) { +__funline __m128i _mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) { return (__m128i)__builtin_ia32_psrlqi128_mask((__v2di)__A, __imm, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, const int __imm) { +__funline __m128i _mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, + const int __imm) { return (__m128i)__builtin_ia32_psrlqi128_mask( (__v2di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_ternarylogic_epi64(__m256i __A, __m256i __B, __m256i __C, - const int __imm) { +__funline __m256i _mm256_ternarylogic_epi64(__m256i __A, __m256i __B, __m256i __C, + const int __imm) { return (__m256i)__builtin_ia32_pternlogq256_mask( (__v4di)__A, (__v4di)__B, (__v4di)__C, __imm, (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_ternarylogic_epi64(__m256i __A, __mmask8 __U, __m256i __B, - __m256i __C, const int __imm) { +__funline __m256i _mm256_mask_ternarylogic_epi64(__m256i __A, __mmask8 __U, + __m256i __B, __m256i __C, + const int __imm) { return (__m256i)__builtin_ia32_pternlogq256_mask( (__v4di)__A, (__v4di)__B, (__v4di)__C, __imm, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_ternarylogic_epi64(__mmask8 __U, __m256i __A, __m256i __B, - __m256i __C, const int __imm) { +__funline __m256i _mm256_maskz_ternarylogic_epi64(__mmask8 __U, __m256i __A, + __m256i __B, __m256i __C, + const int __imm) { return (__m256i)__builtin_ia32_pternlogq256_maskz( (__v4di)__A, (__v4di)__B, (__v4di)__C, __imm, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_ternarylogic_epi32(__m256i __A, __m256i __B, __m256i __C, - const int __imm) { +__funline __m256i _mm256_ternarylogic_epi32(__m256i __A, __m256i __B, __m256i __C, + const int __imm) { return (__m256i)__builtin_ia32_pternlogd256_mask( (__v8si)__A, (__v8si)__B, (__v8si)__C, __imm, (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_ternarylogic_epi32(__m256i __A, __mmask8 __U, __m256i __B, - __m256i __C, const int __imm) { +__funline __m256i _mm256_mask_ternarylogic_epi32(__m256i __A, __mmask8 __U, + __m256i __B, __m256i __C, + const int __imm) { return (__m256i)__builtin_ia32_pternlogd256_mask( (__v8si)__A, (__v8si)__B, (__v8si)__C, __imm, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_ternarylogic_epi32(__mmask8 __U, __m256i __A, __m256i __B, - __m256i __C, const int __imm) { +__funline __m256i _mm256_maskz_ternarylogic_epi32(__mmask8 __U, __m256i __A, + __m256i __B, __m256i __C, + const int __imm) { return (__m256i)__builtin_ia32_pternlogd256_maskz( (__v8si)__A, (__v8si)__B, (__v8si)__C, __imm, (__mmask8)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_ternarylogic_epi64(__m128i __A, __m128i __B, __m128i __C, const int __imm) { +__funline __m128i _mm_ternarylogic_epi64(__m128i __A, __m128i __B, __m128i __C, + const int __imm) { return 
(__m128i)__builtin_ia32_pternlogq128_mask( (__v2di)__A, (__v2di)__B, (__v2di)__C, __imm, (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_ternarylogic_epi64(__m128i __A, __mmask8 __U, __m128i __B, - __m128i __C, const int __imm) { +__funline __m128i _mm_mask_ternarylogic_epi64(__m128i __A, __mmask8 __U, + __m128i __B, __m128i __C, + const int __imm) { return (__m128i)__builtin_ia32_pternlogq128_mask( (__v2di)__A, (__v2di)__B, (__v2di)__C, __imm, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_ternarylogic_epi64(__mmask8 __U, __m128i __A, __m128i __B, - __m128i __C, const int __imm) { +__funline __m128i _mm_maskz_ternarylogic_epi64(__mmask8 __U, __m128i __A, + __m128i __B, __m128i __C, + const int __imm) { return (__m128i)__builtin_ia32_pternlogq128_maskz( (__v2di)__A, (__v2di)__B, (__v2di)__C, __imm, (__mmask8)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_ternarylogic_epi32(__m128i __A, __m128i __B, __m128i __C, const int __imm) { +__funline __m128i _mm_ternarylogic_epi32(__m128i __A, __m128i __B, __m128i __C, + const int __imm) { return (__m128i)__builtin_ia32_pternlogd128_mask( (__v4si)__A, (__v4si)__B, (__v4si)__C, __imm, (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_ternarylogic_epi32(__m128i __A, __mmask8 __U, __m128i __B, - __m128i __C, const int __imm) { +__funline __m128i _mm_mask_ternarylogic_epi32(__m128i __A, __mmask8 __U, + __m128i __B, __m128i __C, + const int __imm) { return (__m128i)__builtin_ia32_pternlogd128_mask( (__v4si)__A, (__v4si)__B, (__v4si)__C, __imm, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_ternarylogic_epi32(__mmask8 __U, __m128i __A, __m128i __B, - __m128i __C, const int __imm) { +__funline __m128i _mm_maskz_ternarylogic_epi32(__mmask8 __U, __m128i __A, + __m128i __B, __m128i __C, + const int __imm) { return (__m128i)__builtin_ia32_pternlogd128_maskz( (__v4si)__A, (__v4si)__B, (__v4si)__C, __imm, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_roundscale_ps(__m256 __A, const int __imm) { +__funline __m256 _mm256_roundscale_ps(__m256 __A, const int __imm) { return (__m256)__builtin_ia32_rndscaleps_256_mask( (__v8sf)__A, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_roundscale_ps(__m256 __W, __mmask8 __U, __m256 __A, - const int __imm) { +__funline __m256 _mm256_mask_roundscale_ps(__m256 __W, __mmask8 __U, __m256 __A, + const int __imm) { return (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)__A, __imm, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_roundscale_ps(__mmask8 __U, __m256 __A, const int __imm) { +__funline __m256 _mm256_maskz_roundscale_ps(__mmask8 __U, __m256 __A, + const int __imm) { return (__m256)__builtin_ia32_rndscaleps_256_mask( (__v8sf)__A, __imm, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_roundscale_pd(__m256d __A, const int __imm) { +__funline __m256d _mm256_roundscale_pd(__m256d __A, const int 
__imm) { return (__m256d)__builtin_ia32_rndscalepd_256_mask( (__v4df)__A, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_roundscale_pd(__m256d __W, __mmask8 __U, __m256d __A, - const int __imm) { +__funline __m256d _mm256_mask_roundscale_pd(__m256d __W, __mmask8 __U, + __m256d __A, const int __imm) { return (__m256d)__builtin_ia32_rndscalepd_256_mask( (__v4df)__A, __imm, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_roundscale_pd(__mmask8 __U, __m256d __A, const int __imm) { +__funline __m256d _mm256_maskz_roundscale_pd(__mmask8 __U, __m256d __A, + const int __imm) { return (__m256d)__builtin_ia32_rndscalepd_256_mask( (__v4df)__A, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_roundscale_ps(__m128 __A, const int __imm) { +__funline __m128 _mm_roundscale_ps(__m128 __A, const int __imm) { return (__m128)__builtin_ia32_rndscaleps_128_mask( (__v4sf)__A, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_roundscale_ps(__m128 __W, __mmask8 __U, __m128 __A, const int __imm) { +__funline __m128 _mm_mask_roundscale_ps(__m128 __W, __mmask8 __U, __m128 __A, + const int __imm) { return (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)__A, __imm, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_roundscale_ps(__mmask8 __U, __m128 __A, const int __imm) { +__funline __m128 _mm_maskz_roundscale_ps(__mmask8 __U, __m128 __A, + const int __imm) { return (__m128)__builtin_ia32_rndscaleps_128_mask( (__v4sf)__A, __imm, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_roundscale_pd(__m128d __A, const int __imm) { +__funline __m128d _mm_roundscale_pd(__m128d __A, const int __imm) { return (__m128d)__builtin_ia32_rndscalepd_128_mask( (__v2df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)-1); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_roundscale_pd(__m128d __W, __mmask8 __U, __m128d __A, - const int __imm) { +__funline __m128d _mm_mask_roundscale_pd(__m128d __W, __mmask8 __U, __m128d __A, + const int __imm) { return (__m128d)__builtin_ia32_rndscalepd_128_mask( (__v2df)__A, __imm, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_roundscale_pd(__mmask8 __U, __m128d __A, const int __imm) { +__funline __m128d _mm_maskz_roundscale_pd(__mmask8 __U, __m128d __A, + const int __imm) { return (__m128d)__builtin_ia32_rndscalepd_128_mask( (__v2df)__A, __imm, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_getmant_ps(__m256 __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { +__funline __m256 _mm256_getmant_ps(__m256 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { return (__m256)__builtin_ia32_getmantps256_mask( (__v8sf)__A, (__C << 2) | __B, (__v8sf)_mm256_setzero_ps(), (__mmask8)-1); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm256_mask_getmant_ps(__m256 __W, __mmask8 __U, __m256 __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { +__funline __m256 _mm256_mask_getmant_ps(__m256 __W, __mmask8 __U, __m256 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { return (__m256)__builtin_ia32_getmantps256_mask((__v8sf)__A, (__C << 2) | __B, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_getmant_ps(__mmask8 __U, __m256 __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { +__funline __m256 _mm256_maskz_getmant_ps(__mmask8 __U, __m256 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { return (__m256)__builtin_ia32_getmantps256_mask((__v8sf)__A, (__C << 2) | __B, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_getmant_ps(__m128 __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { +__funline __m128 _mm_getmant_ps(__m128 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { return (__m128)__builtin_ia32_getmantps128_mask( (__v4sf)__A, (__C << 2) | __B, (__v4sf)_mm_setzero_ps(), (__mmask8)-1); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_getmant_ps(__m128 __W, __mmask8 __U, __m128 __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { +__funline __m128 _mm_mask_getmant_ps(__m128 __W, __mmask8 __U, __m128 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { return (__m128)__builtin_ia32_getmantps128_mask((__v4sf)__A, (__C << 2) | __B, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_getmant_ps(__mmask8 __U, __m128 __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { +__funline __m128 _mm_maskz_getmant_ps(__mmask8 __U, __m128 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { return (__m128)__builtin_ia32_getmantps128_mask( (__v4sf)__A, (__C << 2) | __B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_getmant_pd(__m256d __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { +__funline __m256d _mm256_getmant_pd(__m256d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { return (__m256d)__builtin_ia32_getmantpd256_mask( (__v4df)__A, (__C << 2) | __B, (__v4df)_mm256_setzero_pd(), (__mmask8)-1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_getmant_pd(__m256d __W, __mmask8 __U, __m256d __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { +__funline __m256d _mm256_mask_getmant_pd(__m256d __W, __mmask8 __U, __m256d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { return (__m256d)__builtin_ia32_getmantpd256_mask( (__v4df)__A, (__C << 2) | __B, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_getmant_pd(__mmask8 __U, __m256d __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { +__funline __m256d _mm256_maskz_getmant_pd(__mmask8 __U, __m256d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { return (__m256d)__builtin_ia32_getmantpd256_mask( (__v4df)__A, (__C << 2) | __B, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } 
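Every hunk in this header is the same mechanical substitution, so any call site works as a smoke test for the rewrite. Below is a minimal sketch, not part of the patch: it assumes a toolchain with AVX-512F/VL enabled (e.g. -mavx512f -mavx512vl) and an AVX-512VL machine to run on, and it uses the standard _MM_MANT_* enumerators accepted by _mm256_getmant_pd above. It should compile and print the same values before and after this change, since only the declaration spelling differs.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* Normalize each mantissa into [1,2), keeping the source sign:
     -12.0 = -1.5*2^3, 3.5 = 1.75*2^1, 0.25 = 1.0*2^-2, 96.0 = 1.5*2^6. */
  __m256d x = _mm256_setr_pd(-12.0, 3.5, 0.25, 96.0);
  __m256d m = _mm256_getmant_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
  double out[4];
  _mm256_storeu_pd(out, m);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* -1.5 1.75 1 1.5 */
  return 0;
}
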
-extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_getmant_pd(__m128d __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { +__funline __m128d _mm_getmant_pd(__m128d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { return (__m128d)__builtin_ia32_getmantpd128_mask( (__v2df)__A, (__C << 2) | __B, (__v2df)_mm_setzero_pd(), (__mmask8)-1); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_getmant_pd(__m128d __W, __mmask8 __U, __m128d __A, - _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { +__funline __m128d _mm_mask_getmant_pd(__m128d __W, __mmask8 __U, __m128d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { return (__m128d)__builtin_ia32_getmantpd128_mask( (__v2df)__A, (__C << 2) | __B, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_getmant_pd(__mmask8 __U, __m128d __A, _MM_MANTISSA_NORM_ENUM __B, - _MM_MANTISSA_SIGN_ENUM __C) { +__funline __m128d _mm_maskz_getmant_pd(__mmask8 __U, __m128d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) { return (__m128d)__builtin_ia32_getmantpd128_mask( (__v2df)__A, (__C << 2) | __B, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mmask_i32gather_ps(__m256 __v1_old, __mmask8 __mask, __m256i __index, - void const *__addr, int __scale) { +__funline __m256 _mm256_mmask_i32gather_ps(__m256 __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) { return (__m256)__builtin_ia32_gather3siv8sf((__v8sf)__v1_old, __addr, (__v8si)__index, __mask, __scale); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mmask_i32gather_ps(__m128 __v1_old, __mmask8 __mask, __m128i __index, - void const *__addr, int __scale) { +__funline __m128 _mm_mmask_i32gather_ps(__m128 __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) { return (__m128)__builtin_ia32_gather3siv4sf((__v4sf)__v1_old, __addr, (__v4si)__index, __mask, __scale); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mmask_i32gather_pd(__m256d __v1_old, __mmask8 __mask, - __m128i __index, void const *__addr, - int __scale) { +__funline __m256d _mm256_mmask_i32gather_pd(__m256d __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) { return (__m256d)__builtin_ia32_gather3siv4df( (__v4df)__v1_old, __addr, (__v4si)__index, __mask, __scale); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mmask_i32gather_pd(__m128d __v1_old, __mmask8 __mask, __m128i __index, - void const *__addr, int __scale) { +__funline __m128d _mm_mmask_i32gather_pd(__m128d __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) { return (__m128d)__builtin_ia32_gather3siv2df( (__v2df)__v1_old, __addr, (__v4si)__index, __mask, __scale); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mmask_i64gather_ps(__m128 __v1_old, __mmask8 __mask, __m256i __index, - void const *__addr, int __scale) { +__funline __m128 _mm256_mmask_i64gather_ps(__m128 __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) { return (__m128)__builtin_ia32_gather3div8sf((__v4sf)__v1_old, 
__addr, (__v4di)__index, __mask, __scale); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mmask_i64gather_ps(__m128 __v1_old, __mmask8 __mask, __m128i __index, - void const *__addr, int __scale) { +__funline __m128 _mm_mmask_i64gather_ps(__m128 __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) { return (__m128)__builtin_ia32_gather3div4sf((__v4sf)__v1_old, __addr, (__v2di)__index, __mask, __scale); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mmask_i64gather_pd(__m256d __v1_old, __mmask8 __mask, - __m256i __index, void const *__addr, - int __scale) { +__funline __m256d _mm256_mmask_i64gather_pd(__m256d __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) { return (__m256d)__builtin_ia32_gather3div4df( (__v4df)__v1_old, __addr, (__v4di)__index, __mask, __scale); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mmask_i64gather_pd(__m128d __v1_old, __mmask8 __mask, __m128i __index, - void const *__addr, int __scale) { +__funline __m128d _mm_mmask_i64gather_pd(__m128d __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) { return (__m128d)__builtin_ia32_gather3div2df( (__v2df)__v1_old, __addr, (__v2di)__index, __mask, __scale); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mmask_i32gather_epi32(__m256i __v1_old, __mmask8 __mask, - __m256i __index, void const *__addr, - int __scale) { +__funline __m256i _mm256_mmask_i32gather_epi32(__m256i __v1_old, __mmask8 __mask, + __m256i __index, + void const *__addr, int __scale) { return (__m256i)__builtin_ia32_gather3siv8si( (__v8si)__v1_old, __addr, (__v8si)__index, __mask, __scale); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mmask_i32gather_epi32(__m128i __v1_old, __mmask8 __mask, - __m128i __index, void const *__addr, - int __scale) { +__funline __m128i _mm_mmask_i32gather_epi32(__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) { return (__m128i)__builtin_ia32_gather3siv4si( (__v4si)__v1_old, __addr, (__v4si)__index, __mask, __scale); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mmask_i32gather_epi64(__m256i __v1_old, __mmask8 __mask, - __m128i __index, void const *__addr, - int __scale) { +__funline __m256i _mm256_mmask_i32gather_epi64(__m256i __v1_old, __mmask8 __mask, + __m128i __index, + void const *__addr, int __scale) { return (__m256i)__builtin_ia32_gather3siv4di( (__v4di)__v1_old, __addr, (__v4si)__index, __mask, __scale); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mmask_i32gather_epi64(__m128i __v1_old, __mmask8 __mask, - __m128i __index, void const *__addr, - int __scale) { +__funline __m128i _mm_mmask_i32gather_epi64(__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) { return (__m128i)__builtin_ia32_gather3siv2di( (__v2di)__v1_old, __addr, (__v4si)__index, __mask, __scale); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mmask_i64gather_epi32(__m128i __v1_old, __mmask8 __mask, - __m256i __index, void const *__addr, - int __scale) { +__funline __m128i _mm256_mmask_i64gather_epi32(__m128i __v1_old, __mmask8 __mask, + __m256i 
__index, + void const *__addr, int __scale) { return (__m128i)__builtin_ia32_gather3div8si( (__v4si)__v1_old, __addr, (__v4di)__index, __mask, __scale); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mmask_i64gather_epi32(__m128i __v1_old, __mmask8 __mask, - __m128i __index, void const *__addr, - int __scale) { +__funline __m128i _mm_mmask_i64gather_epi32(__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) { return (__m128i)__builtin_ia32_gather3div4si( (__v4si)__v1_old, __addr, (__v2di)__index, __mask, __scale); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mmask_i64gather_epi64(__m256i __v1_old, __mmask8 __mask, - __m256i __index, void const *__addr, - int __scale) { +__funline __m256i _mm256_mmask_i64gather_epi64(__m256i __v1_old, __mmask8 __mask, + __m256i __index, + void const *__addr, int __scale) { return (__m256i)__builtin_ia32_gather3div4di( (__v4di)__v1_old, __addr, (__v4di)__index, __mask, __scale); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mmask_i64gather_epi64(__m128i __v1_old, __mmask8 __mask, - __m128i __index, void const *__addr, - int __scale) { +__funline __m128i _mm_mmask_i64gather_epi64(__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) { return (__m128i)__builtin_ia32_gather3div2di( (__v2di)__v1_old, __addr, (__v2di)__index, __mask, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_i32scatter_ps(void *__addr, __m256i __index, __m256 __v1, - const int __scale) { +__funline void _mm256_i32scatter_ps(void *__addr, __m256i __index, __m256 __v1, + const int __scale) { __builtin_ia32_scattersiv8sf(__addr, (__mmask8)0xFF, (__v8si)__index, (__v8sf)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i32scatter_ps(void *__addr, __mmask8 __mask, __m256i __index, - __m256 __v1, const int __scale) { +__funline void _mm256_mask_i32scatter_ps(void *__addr, __mmask8 __mask, + __m256i __index, __m256 __v1, + const int __scale) { __builtin_ia32_scattersiv8sf(__addr, __mask, (__v8si)__index, (__v8sf)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i32scatter_ps(void *__addr, __m128i __index, __m128 __v1, - const int __scale) { +__funline void _mm_i32scatter_ps(void *__addr, __m128i __index, __m128 __v1, + const int __scale) { __builtin_ia32_scattersiv4sf(__addr, (__mmask8)0xFF, (__v4si)__index, (__v4sf)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_i32scatter_ps(void *__addr, __mmask8 __mask, __m128i __index, - __m128 __v1, const int __scale) { +__funline void _mm_mask_i32scatter_ps(void *__addr, __mmask8 __mask, + __m128i __index, __m128 __v1, + const int __scale) { __builtin_ia32_scattersiv4sf(__addr, __mask, (__v4si)__index, (__v4sf)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_i32scatter_pd(void *__addr, __m128i __index, __m256d __v1, - const int __scale) { +__funline void _mm256_i32scatter_pd(void *__addr, __m128i __index, __m256d __v1, + const int __scale) { __builtin_ia32_scattersiv4df(__addr, (__mmask8)0xFF, (__v4si)__index, (__v4df)__v1, __scale); } -extern __inline void - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i32scatter_pd(void *__addr, __mmask8 __mask, __m128i __index, - __m256d __v1, const int __scale) { +__funline void _mm256_mask_i32scatter_pd(void *__addr, __mmask8 __mask, + __m128i __index, __m256d __v1, + const int __scale) { __builtin_ia32_scattersiv4df(__addr, __mask, (__v4si)__index, (__v4df)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i32scatter_pd(void *__addr, __m128i __index, __m128d __v1, - const int __scale) { +__funline void _mm_i32scatter_pd(void *__addr, __m128i __index, __m128d __v1, + const int __scale) { __builtin_ia32_scattersiv2df(__addr, (__mmask8)0xFF, (__v4si)__index, (__v2df)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_i32scatter_pd(void *__addr, __mmask8 __mask, __m128i __index, - __m128d __v1, const int __scale) { +__funline void _mm_mask_i32scatter_pd(void *__addr, __mmask8 __mask, + __m128i __index, __m128d __v1, + const int __scale) { __builtin_ia32_scattersiv2df(__addr, __mask, (__v4si)__index, (__v2df)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_i64scatter_ps(void *__addr, __m256i __index, __m128 __v1, - const int __scale) { +__funline void _mm256_i64scatter_ps(void *__addr, __m256i __index, __m128 __v1, + const int __scale) { __builtin_ia32_scatterdiv8sf(__addr, (__mmask8)0xFF, (__v4di)__index, (__v4sf)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i64scatter_ps(void *__addr, __mmask8 __mask, __m256i __index, - __m128 __v1, const int __scale) { +__funline void _mm256_mask_i64scatter_ps(void *__addr, __mmask8 __mask, + __m256i __index, __m128 __v1, + const int __scale) { __builtin_ia32_scatterdiv8sf(__addr, __mask, (__v4di)__index, (__v4sf)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i64scatter_ps(void *__addr, __m128i __index, __m128 __v1, - const int __scale) { +__funline void _mm_i64scatter_ps(void *__addr, __m128i __index, __m128 __v1, + const int __scale) { __builtin_ia32_scatterdiv4sf(__addr, (__mmask8)0xFF, (__v2di)__index, (__v4sf)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_i64scatter_ps(void *__addr, __mmask8 __mask, __m128i __index, - __m128 __v1, const int __scale) { +__funline void _mm_mask_i64scatter_ps(void *__addr, __mmask8 __mask, + __m128i __index, __m128 __v1, + const int __scale) { __builtin_ia32_scatterdiv4sf(__addr, __mask, (__v2di)__index, (__v4sf)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_i64scatter_pd(void *__addr, __m256i __index, __m256d __v1, - const int __scale) { +__funline void _mm256_i64scatter_pd(void *__addr, __m256i __index, __m256d __v1, + const int __scale) { __builtin_ia32_scatterdiv4df(__addr, (__mmask8)0xFF, (__v4di)__index, (__v4df)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i64scatter_pd(void *__addr, __mmask8 __mask, __m256i __index, - __m256d __v1, const int __scale) { +__funline void _mm256_mask_i64scatter_pd(void *__addr, __mmask8 __mask, + __m256i __index, __m256d __v1, + const int __scale) { __builtin_ia32_scatterdiv4df(__addr, __mask, (__v4di)__index, 
(__v4df)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i64scatter_pd(void *__addr, __m128i __index, __m128d __v1, - const int __scale) { +__funline void _mm_i64scatter_pd(void *__addr, __m128i __index, __m128d __v1, + const int __scale) { __builtin_ia32_scatterdiv2df(__addr, (__mmask8)0xFF, (__v2di)__index, (__v2df)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_i64scatter_pd(void *__addr, __mmask8 __mask, __m128i __index, - __m128d __v1, const int __scale) { +__funline void _mm_mask_i64scatter_pd(void *__addr, __mmask8 __mask, + __m128i __index, __m128d __v1, + const int __scale) { __builtin_ia32_scatterdiv2df(__addr, __mask, (__v2di)__index, (__v2df)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_i32scatter_epi32(void *__addr, __m256i __index, __m256i __v1, - const int __scale) { +__funline void _mm256_i32scatter_epi32(void *__addr, __m256i __index, + __m256i __v1, const int __scale) { __builtin_ia32_scattersiv8si(__addr, (__mmask8)0xFF, (__v8si)__index, (__v8si)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i32scatter_epi32(void *__addr, __mmask8 __mask, __m256i __index, - __m256i __v1, const int __scale) { +__funline void _mm256_mask_i32scatter_epi32(void *__addr, __mmask8 __mask, + __m256i __index, __m256i __v1, + const int __scale) { __builtin_ia32_scattersiv8si(__addr, __mask, (__v8si)__index, (__v8si)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i32scatter_epi32(void *__addr, __m128i __index, __m128i __v1, - const int __scale) { +__funline void _mm_i32scatter_epi32(void *__addr, __m128i __index, __m128i __v1, + const int __scale) { __builtin_ia32_scattersiv4si(__addr, (__mmask8)0xFF, (__v4si)__index, (__v4si)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_i32scatter_epi32(void *__addr, __mmask8 __mask, __m128i __index, - __m128i __v1, const int __scale) { +__funline void _mm_mask_i32scatter_epi32(void *__addr, __mmask8 __mask, + __m128i __index, __m128i __v1, + const int __scale) { __builtin_ia32_scattersiv4si(__addr, __mask, (__v4si)__index, (__v4si)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_i32scatter_epi64(void *__addr, __m128i __index, __m256i __v1, - const int __scale) { +__funline void _mm256_i32scatter_epi64(void *__addr, __m128i __index, + __m256i __v1, const int __scale) { __builtin_ia32_scattersiv4di(__addr, (__mmask8)0xFF, (__v4si)__index, (__v4di)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i32scatter_epi64(void *__addr, __mmask8 __mask, __m128i __index, - __m256i __v1, const int __scale) { +__funline void _mm256_mask_i32scatter_epi64(void *__addr, __mmask8 __mask, + __m128i __index, __m256i __v1, + const int __scale) { __builtin_ia32_scattersiv4di(__addr, __mask, (__v4si)__index, (__v4di)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i32scatter_epi64(void *__addr, __m128i __index, __m128i __v1, - const int __scale) { +__funline void _mm_i32scatter_epi64(void *__addr, __m128i __index, __m128i __v1, + const int __scale) { 
__builtin_ia32_scattersiv2di(__addr, (__mmask8)0xFF, (__v4si)__index, (__v2di)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_i32scatter_epi64(void *__addr, __mmask8 __mask, __m128i __index, - __m128i __v1, const int __scale) { +__funline void _mm_mask_i32scatter_epi64(void *__addr, __mmask8 __mask, + __m128i __index, __m128i __v1, + const int __scale) { __builtin_ia32_scattersiv2di(__addr, __mask, (__v4si)__index, (__v2di)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_i64scatter_epi32(void *__addr, __m256i __index, __m128i __v1, - const int __scale) { +__funline void _mm256_i64scatter_epi32(void *__addr, __m256i __index, + __m128i __v1, const int __scale) { __builtin_ia32_scatterdiv8si(__addr, (__mmask8)0xFF, (__v4di)__index, (__v4si)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, __m256i __index, - __m128i __v1, const int __scale) { +__funline void _mm256_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, + __m256i __index, __m128i __v1, + const int __scale) { __builtin_ia32_scatterdiv8si(__addr, __mask, (__v4di)__index, (__v4si)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i64scatter_epi32(void *__addr, __m128i __index, __m128i __v1, - const int __scale) { +__funline void _mm_i64scatter_epi32(void *__addr, __m128i __index, __m128i __v1, + const int __scale) { __builtin_ia32_scatterdiv4si(__addr, (__mmask8)0xFF, (__v2di)__index, (__v4si)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, __m128i __index, - __m128i __v1, const int __scale) { +__funline void _mm_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, + __m128i __index, __m128i __v1, + const int __scale) { __builtin_ia32_scatterdiv4si(__addr, __mask, (__v2di)__index, (__v4si)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_i64scatter_epi64(void *__addr, __m256i __index, __m256i __v1, - const int __scale) { +__funline void _mm256_i64scatter_epi64(void *__addr, __m256i __index, + __m256i __v1, const int __scale) { __builtin_ia32_scatterdiv4di(__addr, (__mmask8)0xFF, (__v4di)__index, (__v4di)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_i64scatter_epi64(void *__addr, __mmask8 __mask, __m256i __index, - __m256i __v1, const int __scale) { +__funline void _mm256_mask_i64scatter_epi64(void *__addr, __mmask8 __mask, + __m256i __index, __m256i __v1, + const int __scale) { __builtin_ia32_scatterdiv4di(__addr, __mask, (__v4di)__index, (__v4di)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_i64scatter_epi64(void *__addr, __m128i __index, __m128i __v1, - const int __scale) { +__funline void _mm_i64scatter_epi64(void *__addr, __m128i __index, __m128i __v1, + const int __scale) { __builtin_ia32_scatterdiv2di(__addr, (__mmask8)0xFF, (__v2di)__index, (__v2di)__v1, __scale); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_i64scatter_epi64(void *__addr, __mmask8 __mask, __m128i __index, - __m128i __v1, const int __scale) { +__funline 
void _mm_mask_i64scatter_epi64(void *__addr, __mmask8 __mask, + __m128i __index, __m128i __v1, + const int __scale) { __builtin_ia32_scatterdiv2di(__addr, __mask, (__v2di)__index, (__v2di)__v1, __scale); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_shuffle_epi32(__m256i __W, __mmask8 __U, __m256i __A, - _MM_PERM_ENUM __mask) { +__funline __m256i _mm256_mask_shuffle_epi32(__m256i __W, __mmask8 __U, + __m256i __A, _MM_PERM_ENUM __mask) { return (__m256i)__builtin_ia32_pshufd256_mask((__v8si)__A, __mask, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A, _MM_PERM_ENUM __mask) { +__funline __m256i _mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A, + _MM_PERM_ENUM __mask) { return (__m256i)__builtin_ia32_pshufd256_mask( (__v8si)__A, __mask, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_shuffle_epi32(__m128i __W, __mmask8 __U, __m128i __A, - _MM_PERM_ENUM __mask) { +__funline __m128i _mm_mask_shuffle_epi32(__m128i __W, __mmask8 __U, __m128i __A, + _MM_PERM_ENUM __mask) { return (__m128i)__builtin_ia32_pshufd128_mask((__v4si)__A, __mask, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_shuffle_epi32(__mmask8 __U, __m128i __A, _MM_PERM_ENUM __mask) { +__funline __m128i _mm_maskz_shuffle_epi32(__mmask8 __U, __m128i __A, + _MM_PERM_ENUM __mask) { return (__m128i)__builtin_ia32_pshufd128_mask( (__v4si)__A, __mask, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_rol_epi32(__m256i __A, const int __B) { +__funline __m256i _mm256_rol_epi32(__m256i __A, const int __B) { return (__m256i)__builtin_ia32_prold256_mask( (__v8si)__A, __B, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_rol_epi32(__m256i __W, __mmask8 __U, __m256i __A, const int __B) { +__funline __m256i _mm256_mask_rol_epi32(__m256i __W, __mmask8 __U, __m256i __A, + const int __B) { return (__m256i)__builtin_ia32_prold256_mask((__v8si)__A, __B, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_rol_epi32(__mmask8 __U, __m256i __A, const int __B) { +__funline __m256i _mm256_maskz_rol_epi32(__mmask8 __U, __m256i __A, + const int __B) { return (__m256i)__builtin_ia32_prold256_mask( (__v8si)__A, __B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rol_epi32(__m128i __A, const int __B) { +__funline __m128i _mm_rol_epi32(__m128i __A, const int __B) { return (__m128i)__builtin_ia32_prold128_mask( (__v4si)__A, __B, (__v4si)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_rol_epi32(__m128i __W, __mmask8 __U, __m128i __A, const int __B) { +__funline __m128i _mm_mask_rol_epi32(__m128i __W, __mmask8 __U, __m128i __A, + const int __B) { return (__m128i)__builtin_ia32_prold128_mask((__v4si)__A, __B, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) - _mm_maskz_rol_epi32(__mmask8 __U, __m128i __A, const int __B) { +__funline __m128i _mm_maskz_rol_epi32(__mmask8 __U, __m128i __A, const int __B) { return (__m128i)__builtin_ia32_prold128_mask( (__v4si)__A, __B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_ror_epi32(__m256i __A, const int __B) { +__funline __m256i _mm256_ror_epi32(__m256i __A, const int __B) { return (__m256i)__builtin_ia32_prord256_mask( (__v8si)__A, __B, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_ror_epi32(__m256i __W, __mmask8 __U, __m256i __A, const int __B) { +__funline __m256i _mm256_mask_ror_epi32(__m256i __W, __mmask8 __U, __m256i __A, + const int __B) { return (__m256i)__builtin_ia32_prord256_mask((__v8si)__A, __B, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_ror_epi32(__mmask8 __U, __m256i __A, const int __B) { +__funline __m256i _mm256_maskz_ror_epi32(__mmask8 __U, __m256i __A, + const int __B) { return (__m256i)__builtin_ia32_prord256_mask( (__v8si)__A, __B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_ror_epi32(__m128i __A, const int __B) { +__funline __m128i _mm_ror_epi32(__m128i __A, const int __B) { return (__m128i)__builtin_ia32_prord128_mask( (__v4si)__A, __B, (__v4si)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_ror_epi32(__m128i __W, __mmask8 __U, __m128i __A, const int __B) { +__funline __m128i _mm_mask_ror_epi32(__m128i __W, __mmask8 __U, __m128i __A, + const int __B) { return (__m128i)__builtin_ia32_prord128_mask((__v4si)__A, __B, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_ror_epi32(__mmask8 __U, __m128i __A, const int __B) { +__funline __m128i _mm_maskz_ror_epi32(__mmask8 __U, __m128i __A, const int __B) { return (__m128i)__builtin_ia32_prord128_mask( (__v4si)__A, __B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_rol_epi64(__m256i __A, const int __B) { +__funline __m256i _mm256_rol_epi64(__m256i __A, const int __B) { return (__m256i)__builtin_ia32_prolq256_mask( (__v4di)__A, __B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_rol_epi64(__m256i __W, __mmask8 __U, __m256i __A, const int __B) { +__funline __m256i _mm256_mask_rol_epi64(__m256i __W, __mmask8 __U, __m256i __A, + const int __B) { return (__m256i)__builtin_ia32_prolq256_mask((__v4di)__A, __B, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_rol_epi64(__mmask8 __U, __m256i __A, const int __B) { +__funline __m256i _mm256_maskz_rol_epi64(__mmask8 __U, __m256i __A, + const int __B) { return (__m256i)__builtin_ia32_prolq256_mask( (__v4di)__A, __B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rol_epi64(__m128i 
__A, const int __B) { +__funline __m128i _mm_rol_epi64(__m128i __A, const int __B) { return (__m128i)__builtin_ia32_prolq128_mask( (__v2di)__A, __B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_rol_epi64(__m128i __W, __mmask8 __U, __m128i __A, const int __B) { +__funline __m128i _mm_mask_rol_epi64(__m128i __W, __mmask8 __U, __m128i __A, + const int __B) { return (__m128i)__builtin_ia32_prolq128_mask((__v2di)__A, __B, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_rol_epi64(__mmask8 __U, __m128i __A, const int __B) { +__funline __m128i _mm_maskz_rol_epi64(__mmask8 __U, __m128i __A, const int __B) { return (__m128i)__builtin_ia32_prolq128_mask( (__v2di)__A, __B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_ror_epi64(__m256i __A, const int __B) { +__funline __m256i _mm256_ror_epi64(__m256i __A, const int __B) { return (__m256i)__builtin_ia32_prorq256_mask( (__v4di)__A, __B, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_ror_epi64(__m256i __W, __mmask8 __U, __m256i __A, const int __B) { +__funline __m256i _mm256_mask_ror_epi64(__m256i __W, __mmask8 __U, __m256i __A, + const int __B) { return (__m256i)__builtin_ia32_prorq256_mask((__v4di)__A, __B, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_ror_epi64(__mmask8 __U, __m256i __A, const int __B) { +__funline __m256i _mm256_maskz_ror_epi64(__mmask8 __U, __m256i __A, + const int __B) { return (__m256i)__builtin_ia32_prorq256_mask( (__v4di)__A, __B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_ror_epi64(__m128i __A, const int __B) { +__funline __m128i _mm_ror_epi64(__m128i __A, const int __B) { return (__m128i)__builtin_ia32_prorq128_mask( (__v2di)__A, __B, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_ror_epi64(__m128i __W, __mmask8 __U, __m128i __A, const int __B) { +__funline __m128i _mm_mask_ror_epi64(__m128i __W, __mmask8 __U, __m128i __A, + const int __B) { return (__m128i)__builtin_ia32_prorq128_mask((__v2di)__A, __B, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_ror_epi64(__mmask8 __U, __m128i __A, const int __B) { +__funline __m128i _mm_maskz_ror_epi64(__mmask8 __U, __m128i __A, const int __B) { return (__m128i)__builtin_ia32_prorq128_mask( (__v2di)__A, __B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_alignr_epi32(__m128i __A, __m128i __B, const int __imm) { +__funline __m128i _mm_alignr_epi32(__m128i __A, __m128i __B, const int __imm) { return (__m128i)__builtin_ia32_alignd128_mask((__v4si)__A, (__v4si)__B, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_alignr_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B, - const int __imm) { 
+__funline __m128i _mm_mask_alignr_epi32(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B, const int __imm) { return (__m128i)__builtin_ia32_alignd128_mask((__v4si)__A, (__v4si)__B, __imm, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_alignr_epi32(__mmask8 __U, __m128i __A, __m128i __B, - const int __imm) { +__funline __m128i _mm_maskz_alignr_epi32(__mmask8 __U, __m128i __A, __m128i __B, + const int __imm) { return (__m128i)__builtin_ia32_alignd128_mask((__v4si)__A, (__v4si)__B, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_alignr_epi64(__m128i __A, __m128i __B, const int __imm) { +__funline __m128i _mm_alignr_epi64(__m128i __A, __m128i __B, const int __imm) { return (__m128i)__builtin_ia32_alignq128_mask((__v2di)__A, (__v2di)__B, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_alignr_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B, - const int __imm) { +__funline __m128i _mm_mask_alignr_epi64(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B, const int __imm) { return (__m128i)__builtin_ia32_alignq128_mask((__v2di)__A, (__v2di)__B, __imm, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_alignr_epi64(__mmask8 __U, __m128i __A, __m128i __B, - const int __imm) { +__funline __m128i _mm_maskz_alignr_epi64(__mmask8 __U, __m128i __A, __m128i __B, + const int __imm) { return (__m128i)__builtin_ia32_alignq128_mask((__v2di)__A, (__v2di)__B, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_alignr_epi32(__m256i __A, __m256i __B, const int __imm) { +__funline __m256i _mm256_alignr_epi32(__m256i __A, __m256i __B, const int __imm) { return (__m256i)__builtin_ia32_alignd256_mask((__v8si)__A, (__v8si)__B, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_alignr_epi32(__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B, const int __imm) { +__funline __m256i _mm256_mask_alignr_epi32(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B, const int __imm) { return (__m256i)__builtin_ia32_alignd256_mask((__v8si)__A, (__v8si)__B, __imm, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_alignr_epi32(__mmask8 __U, __m256i __A, __m256i __B, - const int __imm) { +__funline __m256i _mm256_maskz_alignr_epi32(__mmask8 __U, __m256i __A, + __m256i __B, const int __imm) { return (__m256i)__builtin_ia32_alignd256_mask((__v8si)__A, (__v8si)__B, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_alignr_epi64(__m256i __A, __m256i __B, const int __imm) { +__funline __m256i _mm256_alignr_epi64(__m256i __A, __m256i __B, const int __imm) { return (__m256i)__builtin_ia32_alignq256_mask((__v4di)__A, (__v4di)__B, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_alignr_epi64(__m256i __W, __mmask8 
__U, __m256i __A, - __m256i __B, const int __imm) { +__funline __m256i _mm256_mask_alignr_epi64(__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B, const int __imm) { return (__m256i)__builtin_ia32_alignq256_mask((__v4di)__A, (__v4di)__B, __imm, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_alignr_epi64(__mmask8 __U, __m256i __A, __m256i __B, - const int __imm) { +__funline __m256i _mm256_maskz_alignr_epi64(__mmask8 __U, __m256i __A, + __m256i __B, const int __imm) { return (__m256i)__builtin_ia32_alignq256_mask((__v4di)__A, (__v4di)__B, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cvtps_ph(__m128i __W, __mmask8 __U, __m128 __A, const int __I) { +__funline __m128i _mm_mask_cvtps_ph(__m128i __W, __mmask8 __U, __m128 __A, + const int __I) { return (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)__A, __I, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_cvtps_ph(__mmask8 __U, __m128 __A, const int __I) { +__funline __m128i _mm_maskz_cvtps_ph(__mmask8 __U, __m128 __A, const int __I) { return (__m128i)__builtin_ia32_vcvtps2ph_mask( (__v4sf)__A, __I, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cvtps_ph(__m128i __W, __mmask8 __U, __m256 __A, const int __I) { +__funline __m128i _mm256_mask_cvtps_ph(__m128i __W, __mmask8 __U, __m256 __A, + const int __I) { return (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)__A, __I, (__v8hi)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_cvtps_ph(__mmask8 __U, __m256 __A, const int __I) { +__funline __m128i _mm256_maskz_cvtps_ph(__mmask8 __U, __m256 __A, const int __I) { return (__m128i)__builtin_ia32_vcvtps2ph256_mask( (__v8sf)__A, __I, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, - const int __imm) { +__funline __m256i _mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, + const int __imm) { return (__m256i)__builtin_ia32_psradi256_mask((__v8si)__A, __imm, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, const int __imm) { +__funline __m256i _mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, + const int __imm) { return (__m256i)__builtin_ia32_psradi256_mask( (__v8si)__A, __imm, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, const int __imm) { +__funline __m128i _mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) { return (__m128i)__builtin_ia32_psradi128_mask((__v4si)__A, __imm, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, const int __imm) { +__funline __m128i _mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, + const int __imm) { return 
(__m128i)__builtin_ia32_psradi128_mask( (__v4si)__A, __imm, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_srai_epi64(__m256i __A, const int __imm) { +__funline __m256i _mm256_srai_epi64(__m256i __A, const int __imm) { return (__m256i)__builtin_ia32_psraqi256_mask( (__v4di)__A, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)-1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, - const int __imm) { +__funline __m256i _mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, + const int __imm) { return (__m256i)__builtin_ia32_psraqi256_mask((__v4di)__A, __imm, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, const int __imm) { +__funline __m256i _mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, + const int __imm) { return (__m256i)__builtin_ia32_psraqi256_mask( (__v4di)__A, __imm, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_srai_epi64(__m128i __A, const int __imm) { +__funline __m128i _mm_srai_epi64(__m128i __A, const int __imm) { return (__m128i)__builtin_ia32_psraqi128_mask( (__v2di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)-1); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, const int __imm) { +__funline __m128i _mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) { return (__m128i)__builtin_ia32_psraqi128_mask((__v2di)__A, __imm, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, const int __imm) { +__funline __m128i _mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, + const int __imm) { return (__m128i)__builtin_ia32_psraqi128_mask( (__v2di)__A, __imm, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B) { +__funline __m128i _mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, + int __B) { return (__m128i)__builtin_ia32_pslldi128_mask((__v4si)__A, __B, (__v4si)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B) { +__funline __m128i _mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_pslldi128_mask( (__v4si)__A, __B, (__v4si)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B) { +__funline __m128i _mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, + int __B) { return (__m128i)__builtin_ia32_psllqi128_mask((__v2di)__A, __B, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, int __B) { +__funline __m128i _mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, int __B) { return 
(__m128i)__builtin_ia32_psllqi128_mask( (__v2di)__A, __B, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B) { +__funline __m256i _mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, + int __B) { return (__m256i)__builtin_ia32_pslldi256_mask((__v8si)__A, __B, (__v8si)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, int __B) { +__funline __m256i _mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_pslldi256_mask( (__v8si)__A, __B, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B) { +__funline __m256i _mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, + int __B) { return (__m256i)__builtin_ia32_psllqi256_mask((__v4di)__A, __B, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, int __B) { +__funline __m256i _mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_psllqi256_mask( (__v4di)__A, __B, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_permutex_pd(__m256d __W, __mmask8 __U, __m256d __X, - const int __imm) { +__funline __m256d _mm256_mask_permutex_pd(__m256d __W, __mmask8 __U, __m256d __X, + const int __imm) { return (__m256d)__builtin_ia32_permdf256_mask((__v4df)__X, __imm, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permutex_pd(__mmask8 __U, __m256d __X, const int __imm) { +__funline __m256d _mm256_maskz_permutex_pd(__mmask8 __U, __m256d __X, + const int __imm) { return (__m256d)__builtin_ia32_permdf256_mask( (__v4df)__X, __imm, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_permute_pd(__m256d __W, __mmask8 __U, __m256d __X, const int __C) { +__funline __m256d _mm256_mask_permute_pd(__m256d __W, __mmask8 __U, __m256d __X, + const int __C) { return (__m256d)__builtin_ia32_vpermilpd256_mask((__v4df)__X, __C, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permute_pd(__mmask8 __U, __m256d __X, const int __C) { +__funline __m256d _mm256_maskz_permute_pd(__mmask8 __U, __m256d __X, + const int __C) { return (__m256d)__builtin_ia32_vpermilpd256_mask( (__v4df)__X, __C, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_permute_pd(__m128d __W, __mmask8 __U, __m128d __X, const int __C) { +__funline __m128d _mm_mask_permute_pd(__m128d __W, __mmask8 __U, __m128d __X, + const int __C) { return (__m128d)__builtin_ia32_vpermilpd_mask((__v2df)__X, __C, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_permute_pd(__mmask8 __U, __m128d __X, const int __C) 
{ +__funline __m128d _mm_maskz_permute_pd(__mmask8 __U, __m128d __X, const int __C) { return (__m128d)__builtin_ia32_vpermilpd_mask( (__v2df)__X, __C, (__v2df)_mm_setzero_pd(), (__mmask8)__U); } -extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_permute_ps(__m256 __W, __mmask8 __U, __m256 __X, const int __C) { +__funline __m256 _mm256_mask_permute_ps(__m256 __W, __mmask8 __U, __m256 __X, + const int __C) { return (__m256)__builtin_ia32_vpermilps256_mask((__v8sf)__X, __C, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_permute_ps(__mmask8 __U, __m256 __X, const int __C) { +__funline __m256 _mm256_maskz_permute_ps(__mmask8 __U, __m256 __X, + const int __C) { return (__m256)__builtin_ia32_vpermilps256_mask( (__v8sf)__X, __C, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_permute_ps(__m128 __W, __mmask8 __U, __m128 __X, const int __C) { +__funline __m128 _mm_mask_permute_ps(__m128 __W, __mmask8 __U, __m128 __X, + const int __C) { return (__m128)__builtin_ia32_vpermilps_mask((__v4sf)__X, __C, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_permute_ps(__mmask8 __U, __m128 __X, const int __C) { +__funline __m128 _mm_maskz_permute_ps(__mmask8 __U, __m128 __X, const int __C) { return (__m128)__builtin_ia32_vpermilps_mask( (__v4sf)__X, __C, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_blend_pd(__mmask8 __U, __m256d __A, __m256d __W) { +__funline __m256d _mm256_mask_blend_pd(__mmask8 __U, __m256d __A, __m256d __W) { return (__m256d)__builtin_ia32_blendmpd_256_mask((__v4df)__A, (__v4df)__W, (__mmask8)__U); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_blend_ps(__mmask8 __U, __m256 __A, __m256 __W) { +__funline __m256 _mm256_mask_blend_ps(__mmask8 __U, __m256 __A, __m256 __W) { return (__m256)__builtin_ia32_blendmps_256_mask((__v8sf)__A, (__v8sf)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_blend_epi64(__mmask8 __U, __m256i __A, __m256i __W) { +__funline __m256i _mm256_mask_blend_epi64(__mmask8 __U, __m256i __A, + __m256i __W) { return (__m256i)__builtin_ia32_blendmq_256_mask((__v4di)__A, (__v4di)__W, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_blend_epi32(__mmask8 __U, __m256i __A, __m256i __W) { +__funline __m256i _mm256_mask_blend_epi32(__mmask8 __U, __m256i __A, + __m256i __W) { return (__m256i)__builtin_ia32_blendmd_256_mask((__v8si)__A, (__v8si)__W, (__mmask8)__U); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_blend_pd(__mmask8 __U, __m128d __A, __m128d __W) { +__funline __m128d _mm_mask_blend_pd(__mmask8 __U, __m128d __A, __m128d __W) { return (__m128d)__builtin_ia32_blendmpd_128_mask((__v2df)__A, (__v2df)__W, (__mmask8)__U); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_blend_ps(__mmask8 __U, __m128 __A, __m128 __W) { +__funline __m128 _mm_mask_blend_ps(__mmask8 __U, __m128 __A, __m128 __W) { return 
(__m128)__builtin_ia32_blendmps_128_mask((__v4sf)__A, (__v4sf)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_blend_epi64(__mmask8 __U, __m128i __A, __m128i __W) { +__funline __m128i _mm_mask_blend_epi64(__mmask8 __U, __m128i __A, __m128i __W) { return (__m128i)__builtin_ia32_blendmq_128_mask((__v2di)__A, (__v2di)__W, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_blend_epi32(__mmask8 __U, __m128i __A, __m128i __W) { +__funline __m128i _mm_mask_blend_epi32(__mmask8 __U, __m128i __A, __m128i __W) { return (__m128i)__builtin_ia32_blendmd_128_mask((__v4si)__A, (__v4si)__W, (__mmask8)__U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmp_epi64_mask(__m256i __X, __m256i __Y, const int __P) { +__funline __mmask8 _mm256_cmp_epi64_mask(__m256i __X, __m256i __Y, + const int __P) { return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, __P, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmp_epi32_mask(__m256i __X, __m256i __Y, const int __P) { +__funline __mmask8 _mm256_cmp_epi32_mask(__m256i __X, __m256i __Y, + const int __P) { return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, __P, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmp_epu64_mask(__m256i __X, __m256i __Y, const int __P) { +__funline __mmask8 _mm256_cmp_epu64_mask(__m256i __X, __m256i __Y, + const int __P) { return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, __P, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmp_epu32_mask(__m256i __X, __m256i __Y, const int __P) { +__funline __mmask8 _mm256_cmp_epu32_mask(__m256i __X, __m256i __Y, + const int __P) { return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, __P, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmp_pd_mask(__m256d __X, __m256d __Y, const int __P) { +__funline __mmask8 _mm256_cmp_pd_mask(__m256d __X, __m256d __Y, const int __P) { return (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)__X, (__v4df)__Y, __P, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmp_ps_mask(__m256 __X, __m256 __Y, const int __P) { +__funline __mmask8 _mm256_cmp_ps_mask(__m256 __X, __m256 __Y, const int __P) { return (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)__X, (__v8sf)__Y, __P, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmp_epi64_mask(__mmask8 __U, __m256i __X, __m256i __Y, - const int __P) { +__funline __mmask8 _mm256_mask_cmp_epi64_mask(__mmask8 __U, __m256i __X, + __m256i __Y, const int __P) { return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__X, (__v4di)__Y, __P, (__mmask8)__U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmp_epi32_mask(__mmask8 __U, __m256i __X, __m256i __Y, - const int __P) { +__funline __mmask8 _mm256_mask_cmp_epi32_mask(__mmask8 __U, __m256i __X, + __m256i __Y, const int __P) { return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__X, (__v8si)__Y, __P, (__mmask8)__U); } 
-extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmp_epu64_mask(__mmask8 __U, __m256i __X, __m256i __Y, - const int __P) { +__funline __mmask8 _mm256_mask_cmp_epu64_mask(__mmask8 __U, __m256i __X, + __m256i __Y, const int __P) { return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__X, (__v4di)__Y, __P, (__mmask8)__U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_cmp_epu32_mask(__mmask8 __U, __m256i __X, __m256i __Y, - const int __P) { +__funline __mmask8 _mm256_mask_cmp_epu32_mask(__mmask8 __U, __m256i __X, + __m256i __Y, const int __P) { return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__X, (__v8si)__Y, __P, (__mmask8)__U); } -extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_cmp_pd_mask(__mmask8 __U, __m256d __X, __m256d __Y, const int __P) { +__funline __mmask8 _mm256_mask_cmp_pd_mask(__mmask8 __U, __m256d __X, __m256d __Y, + const int __P) { return (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)__X, (__v4df)__Y, __P, (__mmask8)__U); } -extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_cmp_ps_mask(__mmask8 __U, __m256 __X, __m256 __Y, const int __P) { +__funline __mmask8 _mm256_mask_cmp_ps_mask(__mmask8 __U, __m256 __X, __m256 __Y, + const int __P) { return (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)__X, (__v8sf)__Y, __P, (__mmask8)__U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmp_epi64_mask(__m128i __X, __m128i __Y, const int __P) { +__funline __mmask8 _mm_cmp_epi64_mask(__m128i __X, __m128i __Y, const int __P) { return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, __P, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmp_epi32_mask(__m128i __X, __m128i __Y, const int __P) { +__funline __mmask8 _mm_cmp_epi32_mask(__m128i __X, __m128i __Y, const int __P) { return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, __P, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmp_epu64_mask(__m128i __X, __m128i __Y, const int __P) { +__funline __mmask8 _mm_cmp_epu64_mask(__m128i __X, __m128i __Y, const int __P) { return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, __P, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmp_epu32_mask(__m128i __X, __m128i __Y, const int __P) { +__funline __mmask8 _mm_cmp_epu32_mask(__m128i __X, __m128i __Y, const int __P) { return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, __P, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmp_pd_mask(__m128d __X, __m128d __Y, const int __P) { +__funline __mmask8 _mm_cmp_pd_mask(__m128d __X, __m128d __Y, const int __P) { return (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)__X, (__v2df)__Y, __P, (__mmask8)-1); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmp_ps_mask(__m128 __X, __m128 __Y, const int __P) { +__funline __mmask8 _mm_cmp_ps_mask(__m128 __X, __m128 __Y, const int __P) { return (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)__X, (__v4sf)__Y, __P, (__mmask8)-1); } -extern __inline __mmask8 
__attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_cmp_epi64_mask(__mmask8 __U, __m128i __X, __m128i __Y, const int __P) { +__funline __mmask8 _mm_mask_cmp_epi64_mask(__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) { return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__X, (__v2di)__Y, __P, (__mmask8)__U); } -extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_cmp_epi32_mask(__mmask8 __U, __m128i __X, __m128i __Y, const int __P) { +__funline __mmask8 _mm_mask_cmp_epi32_mask(__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) { return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__X, (__v4si)__Y, __P, (__mmask8)__U); } -extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_cmp_epu64_mask(__mmask8 __U, __m128i __X, __m128i __Y, const int __P) { +__funline __mmask8 _mm_mask_cmp_epu64_mask(__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) { return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__X, (__v2di)__Y, __P, (__mmask8)__U); } -extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_cmp_epu32_mask(__mmask8 __U, __m128i __X, __m128i __Y, const int __P) { +__funline __mmask8 _mm_mask_cmp_epu32_mask(__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) { return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__X, (__v4si)__Y, __P, (__mmask8)__U); } -extern __inline __mmask8 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_cmp_pd_mask(__mmask8 __U, __m128d __X, __m128d __Y, const int __P) { +__funline __mmask8 _mm_mask_cmp_pd_mask(__mmask8 __U, __m128d __X, __m128d __Y, + const int __P) { return (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)__X, (__v2df)__Y, __P, (__mmask8)__U); } -extern __inline __mmask8 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_cmp_ps_mask(__mmask8 __U, __m128 __X, __m128 __Y, const int __P) { +__funline __mmask8 _mm_mask_cmp_ps_mask(__mmask8 __U, __m128 __X, __m128 __Y, + const int __P) { return (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)__X, (__v4sf)__Y, __P, (__mmask8)__U); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutex_pd(__m256d __X, const int __M) { +__funline __m256d _mm256_permutex_pd(__m256d __X, const int __M) { return (__m256d)__builtin_ia32_permdf256_mask( (__v4df)__X, __M, (__v4df)_mm256_undefined_pd(), (__mmask8)-1); } diff --git a/third_party/intel/avx512vnniintrin.internal.h b/third_party/intel/avx512vnniintrin.internal.h index 635b03afe..3706fda4f 100644 --- a/third_party/intel/avx512vnniintrin.internal.h +++ b/third_party/intel/avx512vnniintrin.internal.h @@ -11,92 +11,70 @@ #define __DISABLE_AVX512VNNI__ #endif /* __AVX512VNNI__ */ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_dpbusd_epi32(__m512i __A, __m512i __B, __m512i __C) { +__funline __m512i _mm512_dpbusd_epi32(__m512i __A, __m512i __B, __m512i __C) { return (__m512i)__builtin_ia32_vpdpbusd_v16si((__v16si)__A, (__v16si)__B, (__v16si)__C); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_dpbusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +__funline __m512i _mm512_mask_dpbusd_epi32(__m512i __A, __mmask16 __B, + __m512i __C, __m512i __D) { return (__m512i)__builtin_ia32_vpdpbusd_v16si_mask( (__v16si)__A, (__v16si)__C, 
(__v16si)__D, (__mmask16)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_dpbusd_epi32(__mmask16 __A, __m512i __B, __m512i __C, - __m512i __D) { +__funline __m512i _mm512_maskz_dpbusd_epi32(__mmask16 __A, __m512i __B, + __m512i __C, __m512i __D) { return (__m512i)__builtin_ia32_vpdpbusd_v16si_maskz( (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_dpbusds_epi32(__m512i __A, __m512i __B, __m512i __C) { +__funline __m512i _mm512_dpbusds_epi32(__m512i __A, __m512i __B, __m512i __C) { return (__m512i)__builtin_ia32_vpdpbusds_v16si((__v16si)__A, (__v16si)__B, (__v16si)__C); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_dpbusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, - __m512i __D) { +__funline __m512i _mm512_mask_dpbusds_epi32(__m512i __A, __mmask16 __B, + __m512i __C, __m512i __D) { return (__m512i)__builtin_ia32_vpdpbusds_v16si_mask( (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_dpbusds_epi32(__mmask16 __A, __m512i __B, __m512i __C, - __m512i __D) { +__funline __m512i _mm512_maskz_dpbusds_epi32(__mmask16 __A, __m512i __B, + __m512i __C, __m512i __D) { return (__m512i)__builtin_ia32_vpdpbusds_v16si_maskz( (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_dpwssd_epi32(__m512i __A, __m512i __B, __m512i __C) { +__funline __m512i _mm512_dpwssd_epi32(__m512i __A, __m512i __B, __m512i __C) { return (__m512i)__builtin_ia32_vpdpwssd_v16si((__v16si)__A, (__v16si)__B, (__v16si)__C); } -extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm512_mask_dpwssd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { +__funline __m512i _mm512_mask_dpwssd_epi32(__m512i __A, __mmask16 __B, + __m512i __C, __m512i __D) { return (__m512i)__builtin_ia32_vpdpwssd_v16si_mask( (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_dpwssd_epi32(__mmask16 __A, __m512i __B, __m512i __C, - __m512i __D) { +__funline __m512i _mm512_maskz_dpwssd_epi32(__mmask16 __A, __m512i __B, + __m512i __C, __m512i __D) { return (__m512i)__builtin_ia32_vpdpwssd_v16si_maskz( (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_dpwssds_epi32(__m512i __A, __m512i __B, __m512i __C) { +__funline __m512i _mm512_dpwssds_epi32(__m512i __A, __m512i __B, __m512i __C) { return (__m512i)__builtin_ia32_vpdpwssds_v16si((__v16si)__A, (__v16si)__B, (__v16si)__C); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_dpwssds_epi32(__m512i __A, __mmask16 __B, __m512i __C, - __m512i __D) { +__funline __m512i _mm512_mask_dpwssds_epi32(__m512i __A, __mmask16 __B, + __m512i __C, __m512i __D) { return (__m512i)__builtin_ia32_vpdpwssds_v16si_mask( (__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_dpwssds_epi32(__mmask16 __A, __m512i __B, 
__m512i __C, - __m512i __D) { +__funline __m512i _mm512_maskz_dpwssds_epi32(__mmask16 __A, __m512i __B, + __m512i __C, __m512i __D) { return (__m512i)__builtin_ia32_vpdpwssds_v16si_maskz( (__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A); } diff --git a/third_party/intel/avx512vnnivlintrin.internal.h b/third_party/intel/avx512vnnivlintrin.internal.h index 48fb03c5e..2c2750152 100644 --- a/third_party/intel/avx512vnnivlintrin.internal.h +++ b/third_party/intel/avx512vnnivlintrin.internal.h @@ -12,172 +12,138 @@ #define __DISABLE_AVX512VNNIVL__ #endif /* __AVX512VNNIVL__ */ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_dpbusd_epi32(__m256i __A, __m256i __B, __m256i __C) { +__funline __m256i _mm256_dpbusd_epi32(__m256i __A, __m256i __B, __m256i __C) { return (__m256i)__builtin_ia32_vpdpbusd_v8si((__v8si)__A, (__v8si)__B, (__v8si)__C); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_dpbusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_mask_dpbusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, + __m256i __D) { return (__m256i)__builtin_ia32_vpdpbusd_v8si_mask((__v8si)__A, (__v8si)__C, (__v8si)__D, (__mmask8)__B); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_dpbusd_epi32(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_maskz_dpbusd_epi32(__mmask8 __A, __m256i __B, + __m256i __C, __m256i __D) { return (__m256i)__builtin_ia32_vpdpbusd_v8si_maskz( (__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_dpbusd_epi32(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_dpbusd_epi32(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpdpbusd_v4si((__v4si)__A, (__v4si)__B, (__v4si)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_dpbusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_mask_dpbusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpdpbusd_v4si_mask((__v4si)__A, (__v4si)__C, (__v4si)__D, (__mmask8)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_dpbusd_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_maskz_dpbusd_epi32(__mmask8 __A, __m128i __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpdpbusd_v4si_maskz( (__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_dpbusds_epi32(__m256i __A, __m256i __B, __m256i __C) { +__funline __m256i _mm256_dpbusds_epi32(__m256i __A, __m256i __B, __m256i __C) { return (__m256i)__builtin_ia32_vpdpbusds_v8si((__v8si)__A, (__v8si)__B, (__v8si)__C); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_dpbusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_mask_dpbusds_epi32(__m256i __A, __mmask8 __B, + __m256i __C, __m256i __D) { return (__m256i)__builtin_ia32_vpdpbusds_v8si_mask( (__v8si)__A, (__v8si)__C, (__v8si)__D, (__mmask8)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm256_maskz_dpbusds_epi32(__mmask8 __A, __m256i __B, __m256i __C, - __m256i __D) { +__funline __m256i _mm256_maskz_dpbusds_epi32(__mmask8 __A, __m256i __B, + __m256i __C, __m256i __D) { return (__m256i)__builtin_ia32_vpdpbusds_v8si_maskz( (__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_dpbusds_epi32(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_dpbusds_epi32(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpdpbusds_v4si((__v4si)__A, (__v4si)__B, (__v4si)__C); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_dpbusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_mask_dpbusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpdpbusds_v4si_mask( (__v4si)__A, (__v4si)__C, (__v4si)__D, (__mmask8)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_dpbusds_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_maskz_dpbusds_epi32(__mmask8 __A, __m128i __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpdpbusds_v4si_maskz( (__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_dpwssd_epi32(__m256i __A, __m256i __B, __m256i __C) { +__funline __m256i _mm256_dpwssd_epi32(__m256i __A, __m256i __B, __m256i __C) { return (__m256i)__builtin_ia32_vpdpwssd_v8si((__v8si)__A, (__v8si)__B, (__v8si)__C); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_dpwssd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_mask_dpwssd_epi32(__m256i __A, __mmask8 __B, __m256i __C, + __m256i __D) { return (__m256i)__builtin_ia32_vpdpwssd_v8si_mask((__v8si)__A, (__v8si)__C, (__v8si)__D, (__mmask8)__B); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_maskz_dpwssd_epi32(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_maskz_dpwssd_epi32(__mmask8 __A, __m256i __B, + __m256i __C, __m256i __D) { return (__m256i)__builtin_ia32_vpdpwssd_v8si_maskz( (__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_dpwssd_epi32(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_dpwssd_epi32(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpdpwssd_v4si((__v4si)__A, (__v4si)__B, (__v4si)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_dpwssd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_mask_dpwssd_epi32(__m128i __A, __mmask8 __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpdpwssd_v4si_mask((__v4si)__A, (__v4si)__C, (__v4si)__D, (__mmask8)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_dpwssd_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_maskz_dpwssd_epi32(__mmask8 __A, __m128i __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpdpwssd_v4si_maskz( (__v4si)__B, (__v4si)__C, (__v4si)__D, 
(__mmask8)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_dpwssds_epi32(__m256i __A, __m256i __B, __m256i __C) { +__funline __m256i _mm256_dpwssds_epi32(__m256i __A, __m256i __B, __m256i __C) { return (__m256i)__builtin_ia32_vpdpwssds_v8si((__v8si)__A, (__v8si)__B, (__v8si)__C); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_mask_dpwssds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { +__funline __m256i _mm256_mask_dpwssds_epi32(__m256i __A, __mmask8 __B, + __m256i __C, __m256i __D) { return (__m256i)__builtin_ia32_vpdpwssds_v8si_mask( (__v8si)__A, (__v8si)__C, (__v8si)__D, (__mmask8)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_dpwssds_epi32(__mmask8 __A, __m256i __B, __m256i __C, - __m256i __D) { +__funline __m256i _mm256_maskz_dpwssds_epi32(__mmask8 __A, __m256i __B, + __m256i __C, __m256i __D) { return (__m256i)__builtin_ia32_vpdpwssds_v8si_maskz( (__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_dpwssds_epi32(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_dpwssds_epi32(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpdpwssds_v4si((__v4si)__A, (__v4si)__B, (__v4si)__C); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_dpwssds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_mask_dpwssds_epi32(__m128i __A, __mmask8 __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpdpwssds_v4si_mask( (__v4si)__A, (__v4si)__C, (__v4si)__D, (__mmask8)__B); } -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_maskz_dpwssds_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_maskz_dpwssds_epi32(__mmask8 __A, __m128i __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vpdpwssds_v4si_maskz( (__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A); } diff --git a/third_party/intel/avx512vpopcntdqintrin.internal.h b/third_party/intel/avx512vpopcntdqintrin.internal.h index cc7bc6e12..f7629f866 100644 --- a/third_party/intel/avx512vpopcntdqintrin.internal.h +++ b/third_party/intel/avx512vpopcntdqintrin.internal.h @@ -12,42 +12,32 @@ #define __DISABLE_AVX512VPOPCNTDQ__ #endif /* __AVX512VPOPCNTDQ__ */ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_popcnt_epi32(__m512i __A) { +__funline __m512i _mm512_popcnt_epi32(__m512i __A) { return (__m512i)__builtin_ia32_vpopcountd_v16si((__v16si)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_popcnt_epi32(__m512i __A, __mmask16 __U, __m512i __B) { +__funline __m512i _mm512_mask_popcnt_epi32(__m512i __A, __mmask16 __U, + __m512i __B) { return (__m512i)__builtin_ia32_vpopcountd_v16si_mask( (__v16si)__A, (__v16si)__B, (__mmask16)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) { +__funline __m512i _mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) { return (__m512i)__builtin_ia32_vpopcountd_v16si_mask( (__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U); } -extern __inline __m512i - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_popcnt_epi64(__m512i __A) { +__funline __m512i _mm512_popcnt_epi64(__m512i __A) { return (__m512i)__builtin_ia32_vpopcountq_v8di((__v8di)__A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_popcnt_epi64(__m512i __A, __mmask8 __U, __m512i __B) { +__funline __m512i _mm512_mask_popcnt_epi64(__m512i __A, __mmask8 __U, + __m512i __B) { return (__m512i)__builtin_ia32_vpopcountq_v8di_mask((__v8di)__A, (__v8di)__B, (__mmask8)__U); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) { +__funline __m512i _mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) { return (__m512i)__builtin_ia32_vpopcountq_v8di_mask( (__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U); } diff --git a/third_party/intel/avx512vpopcntdqvlintrin.internal.h b/third_party/intel/avx512vpopcntdqvlintrin.internal.h index 4dc14e009..e46c2cf55 100644 --- a/third_party/intel/avx512vpopcntdqvlintrin.internal.h +++ b/third_party/intel/avx512vpopcntdqvlintrin.internal.h @@ -12,82 +12,60 @@ #define __DISABLE_AVX512VPOPCNTDQVL__ #endif /* __AVX512VPOPCNTDQVL__ */ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_popcnt_epi32(__m128i __A) { +__funline __m128i _mm_popcnt_epi32(__m128i __A) { return (__m128i)__builtin_ia32_vpopcountd_v4si((__v4si)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_popcnt_epi32(__m128i __A, __mmask16 __U, __m128i __B) { +__funline __m128i _mm_mask_popcnt_epi32(__m128i __A, __mmask16 __U, __m128i __B) { return (__m128i)__builtin_ia32_vpopcountd_v4si_mask((__v4si)__A, (__v4si)__B, (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_popcnt_epi32(__mmask16 __U, __m128i __A) { +__funline __m128i _mm_maskz_popcnt_epi32(__mmask16 __U, __m128i __A) { return (__m128i)__builtin_ia32_vpopcountd_v4si_mask( (__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_popcnt_epi32(__m256i __A) { +__funline __m256i _mm256_popcnt_epi32(__m256i __A) { return (__m256i)__builtin_ia32_vpopcountd_v8si((__v8si)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_popcnt_epi32(__m256i __A, __mmask16 __U, __m256i __B) { +__funline __m256i _mm256_mask_popcnt_epi32(__m256i __A, __mmask16 __U, + __m256i __B) { return (__m256i)__builtin_ia32_vpopcountd_v8si_mask((__v8si)__A, (__v8si)__B, (__mmask16)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_popcnt_epi32(__mmask16 __U, __m256i __A) { +__funline __m256i _mm256_maskz_popcnt_epi32(__mmask16 __U, __m256i __A) { return (__m256i)__builtin_ia32_vpopcountd_v8si_mask( (__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask16)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_popcnt_epi64(__m128i __A) { +__funline __m128i _mm_popcnt_epi64(__m128i __A) { return (__m128i)__builtin_ia32_vpopcountq_v2di((__v2di)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_popcnt_epi64(__m128i __A, __mmask8 __U, __m128i __B) { +__funline 
__m128i _mm_mask_popcnt_epi64(__m128i __A, __mmask8 __U, __m128i __B) { return (__m128i)__builtin_ia32_vpopcountq_v2di_mask((__v2di)__A, (__v2di)__B, (__mmask8)__U); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) { +__funline __m128i _mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_vpopcountq_v2di_mask( (__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_popcnt_epi64(__m256i __A) { +__funline __m256i _mm256_popcnt_epi64(__m256i __A) { return (__m256i)__builtin_ia32_vpopcountq_v4di((__v4di)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_popcnt_epi64(__m256i __A, __mmask8 __U, __m256i __B) { +__funline __m256i _mm256_mask_popcnt_epi64(__m256i __A, __mmask8 __U, + __m256i __B) { return (__m256i)__builtin_ia32_vpopcountq_v4di_mask((__v4di)__A, (__v4di)__B, (__mmask8)__U); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) { +__funline __m256i _mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_vpopcountq_v4di_mask( (__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); } diff --git a/third_party/intel/avxintrin.internal.h b/third_party/intel/avxintrin.internal.h index 93542c8d4..c3e7ca305 100644 --- a/third_party/intel/avxintrin.internal.h +++ b/third_party/intel/avxintrin.internal.h @@ -66,64 +66,44 @@ typedef double __m256d_u #define _CMP_GT_OQ 0x1e #define _CMP_TRUE_US 0x1f -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_add_pd(__m256d __A, __m256d __B) { +__funline __m256d _mm256_add_pd(__m256d __A, __m256d __B) { return (__m256d)((__v4df)__A + (__v4df)__B); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_add_ps(__m256 __A, __m256 __B) { +__funline __m256 _mm256_add_ps(__m256 __A, __m256 __B) { return (__m256)((__v8sf)__A + (__v8sf)__B); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_addsub_pd(__m256d __A, __m256d __B) { +__funline __m256d _mm256_addsub_pd(__m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_addsubpd256((__v4df)__A, (__v4df)__B); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_addsub_ps(__m256 __A, __m256 __B) { +__funline __m256 _mm256_addsub_ps(__m256 __A, __m256 __B) { return (__m256)__builtin_ia32_addsubps256((__v8sf)__A, (__v8sf)__B); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_and_pd(__m256d __A, __m256d __B) { +__funline __m256d _mm256_and_pd(__m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_andpd256((__v4df)__A, (__v4df)__B); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_and_ps(__m256 __A, __m256 __B) { +__funline __m256 _mm256_and_ps(__m256 __A, __m256 __B) { return (__m256)__builtin_ia32_andps256((__v8sf)__A, (__v8sf)__B); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_andnot_pd(__m256d __A, __m256d __B) { +__funline __m256d _mm256_andnot_pd(__m256d __A, __m256d __B) { return 
(__m256d)__builtin_ia32_andnpd256((__v4df)__A, (__v4df)__B); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_andnot_ps(__m256 __A, __m256 __B) { +__funline __m256 _mm256_andnot_ps(__m256 __A, __m256 __B) { return (__m256)__builtin_ia32_andnps256((__v8sf)__A, (__v8sf)__B); } #ifdef __OPTIMIZE__ -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_blend_pd(__m256d __X, __m256d __Y, const int __M) { +__funline __m256d _mm256_blend_pd(__m256d __X, __m256d __Y, const int __M) { return (__m256d)__builtin_ia32_blendpd256((__v4df)__X, (__v4df)__Y, __M); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_blend_ps(__m256 __X, __m256 __Y, const int __M) { +__funline __m256 _mm256_blend_ps(__m256 __X, __m256 __Y, const int __M) { return (__m256)__builtin_ia32_blendps256((__v8sf)__X, (__v8sf)__Y, __M); } #else @@ -136,36 +116,26 @@ extern __inline __m256 (int)(M))) #endif -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_blendv_pd(__m256d __X, __m256d __Y, __m256d __M) { +__funline __m256d _mm256_blendv_pd(__m256d __X, __m256d __Y, __m256d __M) { return (__m256d)__builtin_ia32_blendvpd256((__v4df)__X, (__v4df)__Y, (__v4df)__M); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_blendv_ps(__m256 __X, __m256 __Y, __m256 __M) { +__funline __m256 _mm256_blendv_ps(__m256 __X, __m256 __Y, __m256 __M) { return (__m256)__builtin_ia32_blendvps256((__v8sf)__X, (__v8sf)__Y, (__v8sf)__M); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_div_pd(__m256d __A, __m256d __B) { +__funline __m256d _mm256_div_pd(__m256d __A, __m256d __B) { return (__m256d)((__v4df)__A / (__v4df)__B); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_div_ps(__m256 __A, __m256 __B) { +__funline __m256 _mm256_div_ps(__m256 __A, __m256 __B) { return (__m256)((__v8sf)__A / (__v8sf)__B); } #ifdef __OPTIMIZE__ -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_dp_ps(__m256 __X, __m256 __Y, const int __M) { +__funline __m256 _mm256_dp_ps(__m256 __X, __m256 __Y, const int __M) { return (__m256)__builtin_ia32_dpps256((__v8sf)__X, (__v8sf)__Y, __M); } #else @@ -174,88 +144,60 @@ extern __inline __m256 (int)(M))) #endif -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_hadd_pd(__m256d __X, __m256d __Y) { +__funline __m256d _mm256_hadd_pd(__m256d __X, __m256d __Y) { return (__m256d)__builtin_ia32_haddpd256((__v4df)__X, (__v4df)__Y); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_hadd_ps(__m256 __X, __m256 __Y) { +__funline __m256 _mm256_hadd_ps(__m256 __X, __m256 __Y) { return (__m256)__builtin_ia32_haddps256((__v8sf)__X, (__v8sf)__Y); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_hsub_pd(__m256d __X, __m256d __Y) { +__funline __m256d _mm256_hsub_pd(__m256d __X, __m256d __Y) { return (__m256d)__builtin_ia32_hsubpd256((__v4df)__X, (__v4df)__Y); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_hsub_ps(__m256 __X, __m256 __Y) { +__funline __m256 _mm256_hsub_ps(__m256 __X, __m256 __Y) { return 
(__m256)__builtin_ia32_hsubps256((__v8sf)__X, (__v8sf)__Y); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_max_pd(__m256d __A, __m256d __B) { +__funline __m256d _mm256_max_pd(__m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_maxpd256((__v4df)__A, (__v4df)__B); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_max_ps(__m256 __A, __m256 __B) { +__funline __m256 _mm256_max_ps(__m256 __A, __m256 __B) { return (__m256)__builtin_ia32_maxps256((__v8sf)__A, (__v8sf)__B); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_min_pd(__m256d __A, __m256d __B) { +__funline __m256d _mm256_min_pd(__m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_minpd256((__v4df)__A, (__v4df)__B); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_min_ps(__m256 __A, __m256 __B) { +__funline __m256 _mm256_min_ps(__m256 __A, __m256 __B) { return (__m256)__builtin_ia32_minps256((__v8sf)__A, (__v8sf)__B); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mul_pd(__m256d __A, __m256d __B) { +__funline __m256d _mm256_mul_pd(__m256d __A, __m256d __B) { return (__m256d)((__v4df)__A * (__v4df)__B); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mul_ps(__m256 __A, __m256 __B) { +__funline __m256 _mm256_mul_ps(__m256 __A, __m256 __B) { return (__m256)((__v8sf)__A * (__v8sf)__B); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_or_pd(__m256d __A, __m256d __B) { +__funline __m256d _mm256_or_pd(__m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_orpd256((__v4df)__A, (__v4df)__B); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_or_ps(__m256 __A, __m256 __B) { +__funline __m256 _mm256_or_ps(__m256 __A, __m256 __B) { return (__m256)__builtin_ia32_orps256((__v8sf)__A, (__v8sf)__B); } #ifdef __OPTIMIZE__ -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shuffle_pd(__m256d __A, __m256d __B, const int __mask) { +__funline __m256d _mm256_shuffle_pd(__m256d __A, __m256d __B, const int __mask) { return (__m256d)__builtin_ia32_shufpd256((__v4df)__A, (__v4df)__B, __mask); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_shuffle_ps(__m256 __A, __m256 __B, const int __mask) { +__funline __m256 _mm256_shuffle_ps(__m256 __A, __m256 __B, const int __mask) { return (__m256)__builtin_ia32_shufps256((__v8sf)__A, (__v8sf)__B, __mask); } #else @@ -268,64 +210,44 @@ extern __inline __m256 (int)(N))) #endif -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sub_pd(__m256d __A, __m256d __B) { +__funline __m256d _mm256_sub_pd(__m256d __A, __m256d __B) { return (__m256d)((__v4df)__A - (__v4df)__B); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sub_ps(__m256 __A, __m256 __B) { +__funline __m256 _mm256_sub_ps(__m256 __A, __m256 __B) { return (__m256)((__v8sf)__A - (__v8sf)__B); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_xor_pd(__m256d __A, __m256d __B) { +__funline __m256d _mm256_xor_pd(__m256d __A, __m256d __B) { return 
(__m256d)__builtin_ia32_xorpd256((__v4df)__A, (__v4df)__B); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_xor_ps(__m256 __A, __m256 __B) { +__funline __m256 _mm256_xor_ps(__m256 __A, __m256 __B) { return (__m256)__builtin_ia32_xorps256((__v8sf)__A, (__v8sf)__B); } #ifdef __OPTIMIZE__ -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmp_pd(__m128d __X, __m128d __Y, const int __P) { +__funline __m128d _mm_cmp_pd(__m128d __X, __m128d __Y, const int __P) { return (__m128d)__builtin_ia32_cmppd((__v2df)__X, (__v2df)__Y, __P); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmp_ps(__m128 __X, __m128 __Y, const int __P) { +__funline __m128 _mm_cmp_ps(__m128 __X, __m128 __Y, const int __P) { return (__m128)__builtin_ia32_cmpps((__v4sf)__X, (__v4sf)__Y, __P); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmp_pd(__m256d __X, __m256d __Y, const int __P) { +__funline __m256d _mm256_cmp_pd(__m256d __X, __m256d __Y, const int __P) { return (__m256d)__builtin_ia32_cmppd256((__v4df)__X, (__v4df)__Y, __P); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cmp_ps(__m256 __X, __m256 __Y, const int __P) { +__funline __m256 _mm256_cmp_ps(__m256 __X, __m256 __Y, const int __P) { return (__m256)__builtin_ia32_cmpps256((__v8sf)__X, (__v8sf)__Y, __P); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmp_sd(__m128d __X, __m128d __Y, const int __P) { +__funline __m128d _mm_cmp_sd(__m128d __X, __m128d __Y, const int __P) { return (__m128d)__builtin_ia32_cmpsd((__v2df)__X, (__v2df)__Y, __P); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmp_ss(__m128 __X, __m128 __Y, const int __P) { +__funline __m128 _mm_cmp_ss(__m128 __X, __m128 __Y, const int __P) { return (__m128)__builtin_ia32_cmpss((__v4sf)__X, (__v4sf)__Y, __P); } #else @@ -354,110 +276,76 @@ extern __inline __m128 (int)(P))) #endif -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi32_pd(__m128i __A) { +__funline __m256d _mm256_cvtepi32_pd(__m128i __A) { return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si)__A); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtepi32_ps(__m256i __A) { +__funline __m256 _mm256_cvtepi32_ps(__m256i __A) { return (__m256)__builtin_ia32_cvtdq2ps256((__v8si)__A); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtpd_ps(__m256d __A) { +__funline __m128 _mm256_cvtpd_ps(__m256d __A) { return (__m128)__builtin_ia32_cvtpd2ps256((__v4df)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtps_epi32(__m256 __A) { +__funline __m256i _mm256_cvtps_epi32(__m256 __A) { return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf)__A); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtps_pd(__m128 __A) { +__funline __m256d _mm256_cvtps_pd(__m128 __A) { return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvttpd_epi32(__m256d __A) { +__funline __m128i _mm256_cvttpd_epi32(__m256d __A) { return 
(__m128i)__builtin_ia32_cvttpd2dq256((__v4df)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtpd_epi32(__m256d __A) { +__funline __m128i _mm256_cvtpd_epi32(__m256d __A) { return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvttps_epi32(__m256 __A) { +__funline __m256i _mm256_cvttps_epi32(__m256 __A) { return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf)__A); } -extern __inline double - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtsd_f64(__m256d __A) { +__funline double _mm256_cvtsd_f64(__m256d __A) { return __A[0]; } -extern __inline float - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_cvtss_f32(__m256 __A) { +__funline float _mm256_cvtss_f32(__m256 __A) { return __A[0]; } #ifdef __OPTIMIZE__ -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_extractf128_pd(__m256d __X, const int __N) { +__funline __m128d _mm256_extractf128_pd(__m256d __X, const int __N) { return (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__X, __N); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_extractf128_ps(__m256 __X, const int __N) { +__funline __m128 _mm256_extractf128_ps(__m256 __X, const int __N) { return (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__X, __N); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_extractf128_si256(__m256i __X, const int __N) { +__funline __m128i _mm256_extractf128_si256(__m256i __X, const int __N) { return (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__X, __N); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_extract_epi32(__m256i __X, int const __N) { +__funline int _mm256_extract_epi32(__m256i __X, int const __N) { __m128i __Y = _mm256_extractf128_si256(__X, __N >> 2); return _mm_extract_epi32(__Y, __N % 4); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_extract_epi16(__m256i __X, int const __N) { +__funline int _mm256_extract_epi16(__m256i __X, int const __N) { __m128i __Y = _mm256_extractf128_si256(__X, __N >> 3); return _mm_extract_epi16(__Y, __N % 8); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_extract_epi8(__m256i __X, int const __N) { +__funline int _mm256_extract_epi8(__m256i __X, int const __N) { __m128i __Y = _mm256_extractf128_si256(__X, __N >> 4); return _mm_extract_epi8(__Y, __N % 16); } #ifdef __x86_64__ -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_extract_epi64(__m256i __X, const int __N) { +__funline long long _mm256_extract_epi64(__m256i __X, const int __N) { __m128i __Y = _mm256_extractf128_si256(__X, __N >> 1); return _mm_extract_epi64(__Y, __N % 2); } @@ -499,64 +387,44 @@ extern __inline long long #endif #endif -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_zeroall(void) { +__funline void _mm256_zeroall(void) { __builtin_ia32_vzeroall(); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_zeroupper(void) { +__funline void _mm256_zeroupper(void) { __builtin_ia32_vzeroupper(); } -extern __inline __m128d - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_permutevar_pd(__m128d __A, __m128i __C) { +__funline __m128d _mm_permutevar_pd(__m128d __A, __m128i __C) { return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__A, (__v2di)__C); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutevar_pd(__m256d __A, __m256i __C) { +__funline __m256d _mm256_permutevar_pd(__m256d __A, __m256i __C) { return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__A, (__v4di)__C); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_permutevar_ps(__m128 __A, __m128i __C) { +__funline __m128 _mm_permutevar_ps(__m128 __A, __m128i __C) { return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__A, (__v4si)__C); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permutevar_ps(__m256 __A, __m256i __C) { +__funline __m256 _mm256_permutevar_ps(__m256 __A, __m256i __C) { return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__A, (__v8si)__C); } #ifdef __OPTIMIZE__ -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_permute_pd(__m128d __X, const int __C) { +__funline __m128d _mm_permute_pd(__m128d __X, const int __C) { return (__m128d)__builtin_ia32_vpermilpd((__v2df)__X, __C); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permute_pd(__m256d __X, const int __C) { +__funline __m256d _mm256_permute_pd(__m256d __X, const int __C) { return (__m256d)__builtin_ia32_vpermilpd256((__v4df)__X, __C); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_permute_ps(__m128 __X, const int __C) { +__funline __m128 _mm_permute_ps(__m128 __X, const int __C) { return (__m128)__builtin_ia32_vpermilps((__v4sf)__X, __C); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permute_ps(__m256 __X, const int __C) { +__funline __m256 _mm256_permute_ps(__m256 __X, const int __C) { return (__m256)__builtin_ia32_vpermilps256((__v8sf)__X, __C); } #else @@ -574,22 +442,18 @@ extern __inline __m256 #endif #ifdef __OPTIMIZE__ -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permute2f128_pd(__m256d __X, __m256d __Y, const int __C) { +__funline __m256d _mm256_permute2f128_pd(__m256d __X, __m256d __Y, + const int __C) { return (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)__X, (__v4df)__Y, __C); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permute2f128_ps(__m256 __X, __m256 __Y, const int __C) { +__funline __m256 _mm256_permute2f128_ps(__m256 __X, __m256 __Y, const int __C) { return (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)__X, (__v8sf)__Y, __C); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permute2f128_si256(__m256i __X, __m256i __Y, const int __C) { +__funline __m256i _mm256_permute2f128_si256(__m256i __X, __m256i __Y, + const int __C) { return (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)__X, (__v8si)__Y, __C); } @@ -607,86 +471,63 @@ extern __inline __m256i (__v8si)(__m256i)(Y), (int)(C))) #endif -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_broadcast_ss(float const *__X) { +__funline __m128 _mm_broadcast_ss(float const *__X) { return 
(__m128)__builtin_ia32_vbroadcastss(__X); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcast_sd(double const *__X) { +__funline __m256d _mm256_broadcast_sd(double const *__X) { return (__m256d)__builtin_ia32_vbroadcastsd256(__X); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcast_ss(float const *__X) { +__funline __m256 _mm256_broadcast_ss(float const *__X) { return (__m256)__builtin_ia32_vbroadcastss256(__X); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcast_pd(__m128d const *__X) { +__funline __m256d _mm256_broadcast_pd(__m128d const *__X) { return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__X); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_broadcast_ps(__m128 const *__X) { +__funline __m256 _mm256_broadcast_ps(__m128 const *__X) { return (__m256)__builtin_ia32_vbroadcastf128_ps256(__X); } #ifdef __OPTIMIZE__ -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_insertf128_pd(__m256d __X, __m128d __Y, const int __O) { +__funline __m256d _mm256_insertf128_pd(__m256d __X, __m128d __Y, const int __O) { return (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)__X, (__v2df)__Y, __O); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_insertf128_ps(__m256 __X, __m128 __Y, const int __O) { +__funline __m256 _mm256_insertf128_ps(__m256 __X, __m128 __Y, const int __O) { return (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)__X, (__v4sf)__Y, __O); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_insertf128_si256(__m256i __X, __m128i __Y, const int __O) { +__funline __m256i _mm256_insertf128_si256(__m256i __X, __m128i __Y, + const int __O) { return (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)__X, (__v4si)__Y, __O); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_insert_epi32(__m256i __X, int __D, int const __N) { +__funline __m256i _mm256_insert_epi32(__m256i __X, int __D, int const __N) { __m128i __Y = _mm256_extractf128_si256(__X, __N >> 2); __Y = _mm_insert_epi32(__Y, __D, __N % 4); return _mm256_insertf128_si256(__X, __Y, __N >> 2); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_insert_epi16(__m256i __X, int __D, int const __N) { +__funline __m256i _mm256_insert_epi16(__m256i __X, int __D, int const __N) { __m128i __Y = _mm256_extractf128_si256(__X, __N >> 3); __Y = _mm_insert_epi16(__Y, __D, __N % 8); return _mm256_insertf128_si256(__X, __Y, __N >> 3); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_insert_epi8(__m256i __X, int __D, int const __N) { +__funline __m256i _mm256_insert_epi8(__m256i __X, int __D, int const __N) { __m128i __Y = _mm256_extractf128_si256(__X, __N >> 4); __Y = _mm_insert_epi8(__Y, __D, __N % 16); return _mm256_insertf128_si256(__X, __Y, __N >> 4); } #ifdef __x86_64__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_insert_epi64(__m256i __X, long long __D, int const __N) { +__funline __m256i _mm256_insert_epi64(__m256i __X, long long __D, int const __N) { __m128i __Y = _mm256_extractf128_si256(__X, __N >> 1); __Y = _mm_insert_epi64(__Y, 
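/*
 * Editor's aside, not part of the patch: _mm256_insert_epi32/16/8 above
 * all follow the same pattern, extract the 128-bit half holding lane __N,
 * insert into it, then put the half back. A minimal sketch built on the
 * same primitive, assuming -mavx (the function name is hypothetical):
 */
#include <immintrin.h>
static __m256i replace_high_half(__m256i v, __m128i hi) {
  return _mm256_insertf128_si256(v, hi, 1); /* 1 selects the upper half */
}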
__D, __N % 2); return _mm256_insertf128_si256(__X, __Y, __N >> 1); @@ -736,203 +577,137 @@ extern __inline __m256i #endif #endif -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_load_pd(double const *__P) { +__funline __m256d _mm256_load_pd(double const *__P) { return *(__m256d *)__P; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_store_pd(double *__P, __m256d __A) { +__funline void _mm256_store_pd(double *__P, __m256d __A) { *(__m256d *)__P = __A; } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_load_ps(float const *__P) { +__funline __m256 _mm256_load_ps(float const *__P) { return *(__m256 *)__P; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_store_ps(float *__P, __m256 __A) { +__funline void _mm256_store_ps(float *__P, __m256 __A) { *(__m256 *)__P = __A; } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_loadu_pd(double const *__P) { +__funline __m256d _mm256_loadu_pd(double const *__P) { return *(__m256d_u *)__P; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_storeu_pd(double *__P, __m256d __A) { +__funline void _mm256_storeu_pd(double *__P, __m256d __A) { *(__m256d_u *)__P = __A; } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_loadu_ps(float const *__P) { +__funline __m256 _mm256_loadu_ps(float const *__P) { return *(__m256_u *)__P; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_storeu_ps(float *__P, __m256 __A) { +__funline void _mm256_storeu_ps(float *__P, __m256 __A) { *(__m256_u *)__P = __A; } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_load_si256(__m256i const *__P) { +__funline __m256i _mm256_load_si256(__m256i const *__P) { return *__P; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_store_si256(__m256i *__P, __m256i __A) { +__funline void _mm256_store_si256(__m256i *__P, __m256i __A) { *__P = __A; } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_loadu_si256(__m256i_u const *__P) { +__funline __m256i _mm256_loadu_si256(__m256i_u const *__P) { return *__P; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_storeu_si256(__m256i_u *__P, __m256i __A) { +__funline void _mm256_storeu_si256(__m256i_u *__P, __m256i __A) { *__P = __A; } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskload_pd(double const *__P, __m128i __M) { +__funline __m128d _mm_maskload_pd(double const *__P, __m128i __M) { return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__P, (__v2di)__M); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskstore_pd(double *__P, __m128i __M, __m128d __A) { +__funline void _mm_maskstore_pd(double *__P, __m128i __M, __m128d __A) { __builtin_ia32_maskstorepd((__v2df *)__P, (__v2di)__M, (__v2df)__A); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskload_pd(double const *__P, __m256i __M) { +__funline __m256d _mm256_maskload_pd(double const *__P, __m256i __M) { return 
(__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__P, (__v4di)__M); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskstore_pd(double *__P, __m256i __M, __m256d __A) { +__funline void _mm256_maskstore_pd(double *__P, __m256i __M, __m256d __A) { __builtin_ia32_maskstorepd256((__v4df *)__P, (__v4di)__M, (__v4df)__A); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskload_ps(float const *__P, __m128i __M) { +__funline __m128 _mm_maskload_ps(float const *__P, __m128i __M) { return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__P, (__v4si)__M); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskstore_ps(float *__P, __m128i __M, __m128 __A) { +__funline void _mm_maskstore_ps(float *__P, __m128i __M, __m128 __A) { __builtin_ia32_maskstoreps((__v4sf *)__P, (__v4si)__M, (__v4sf)__A); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskload_ps(float const *__P, __m256i __M) { +__funline __m256 _mm256_maskload_ps(float const *__P, __m256i __M) { return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__P, (__v8si)__M); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskstore_ps(float *__P, __m256i __M, __m256 __A) { +__funline void _mm256_maskstore_ps(float *__P, __m256i __M, __m256 __A) { __builtin_ia32_maskstoreps256((__v8sf *)__P, (__v8si)__M, (__v8sf)__A); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_movehdup_ps(__m256 __X) { +__funline __m256 _mm256_movehdup_ps(__m256 __X) { return (__m256)__builtin_ia32_movshdup256((__v8sf)__X); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_moveldup_ps(__m256 __X) { +__funline __m256 _mm256_moveldup_ps(__m256 __X) { return (__m256)__builtin_ia32_movsldup256((__v8sf)__X); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_movedup_pd(__m256d __X) { +__funline __m256d _mm256_movedup_pd(__m256d __X) { return (__m256d)__builtin_ia32_movddup256((__v4df)__X); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_lddqu_si256(__m256i const *__P) { +__funline __m256i _mm256_lddqu_si256(__m256i const *__P) { return (__m256i)__builtin_ia32_lddqu256((char const *)__P); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_stream_si256(__m256i *__A, __m256i __B) { +__funline void _mm256_stream_si256(__m256i *__A, __m256i __B) { __builtin_ia32_movntdq256((__v4di *)__A, (__v4di)__B); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_stream_pd(double *__A, __m256d __B) { +__funline void _mm256_stream_pd(double *__A, __m256d __B) { __builtin_ia32_movntpd256(__A, (__v4df)__B); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_stream_ps(float *__P, __m256 __A) { +__funline void _mm256_stream_ps(float *__P, __m256 __A) { __builtin_ia32_movntps256(__P, (__v8sf)__A); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_rcp_ps(__m256 __A) { +__funline __m256 _mm256_rcp_ps(__m256 __A) { return (__m256)__builtin_ia32_rcpps256((__v8sf)__A); } -extern __inline __m256 - 
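/*
 * Editor's aside, not part of the patch: the maskload/maskstore intrinsics
 * above key off the sign bit of each mask lane, so an all-ones lane means
 * "touch this element" and a zero lane means "leave it alone". A minimal
 * sketch, assuming -mavx (the function name is hypothetical):
 */
#include <immintrin.h>
static void store_low_two(double *p, __m256d v) {
  __m256i m = _mm256_set_epi64x(0, 0, -1, -1); /* lanes 0 and 1 selected */
  _mm256_maskstore_pd(p, m, v);                /* writes p[0] and p[1] only */
}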
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_rsqrt_ps(__m256 __A) { +__funline __m256 _mm256_rsqrt_ps(__m256 __A) { return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__A); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sqrt_pd(__m256d __A) { +__funline __m256d _mm256_sqrt_pd(__m256d __A) { return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__A); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_sqrt_ps(__m256 __A) { +__funline __m256 _mm256_sqrt_ps(__m256 __A) { return (__m256)__builtin_ia32_sqrtps256((__v8sf)__A); } #ifdef __OPTIMIZE__ -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_round_pd(__m256d __V, const int __M) { +__funline __m256d _mm256_round_pd(__m256d __V, const int __M) { return (__m256d)__builtin_ia32_roundpd256((__v4df)__V, __M); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_round_ps(__m256 __V, const int __M) { +__funline __m256 _mm256_round_ps(__m256 __V, const int __M) { return (__m256)__builtin_ia32_roundps256((__v8sf)__V, __M); } #else @@ -948,211 +723,151 @@ extern __inline __m256 #define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_unpackhi_pd(__m256d __A, __m256d __B) { +__funline __m256d _mm256_unpackhi_pd(__m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_unpckhpd256((__v4df)__A, (__v4df)__B); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_unpacklo_pd(__m256d __A, __m256d __B) { +__funline __m256d _mm256_unpacklo_pd(__m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_unpcklpd256((__v4df)__A, (__v4df)__B); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_unpackhi_ps(__m256 __A, __m256 __B) { +__funline __m256 _mm256_unpackhi_ps(__m256 __A, __m256 __B) { return (__m256)__builtin_ia32_unpckhps256((__v8sf)__A, (__v8sf)__B); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_unpacklo_ps(__m256 __A, __m256 __B) { +__funline __m256 _mm256_unpacklo_ps(__m256 __A, __m256 __B) { return (__m256)__builtin_ia32_unpcklps256((__v8sf)__A, (__v8sf)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_testz_pd(__m128d __M, __m128d __V) { +__funline int _mm_testz_pd(__m128d __M, __m128d __V) { return __builtin_ia32_vtestzpd((__v2df)__M, (__v2df)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_testc_pd(__m128d __M, __m128d __V) { +__funline int _mm_testc_pd(__m128d __M, __m128d __V) { return __builtin_ia32_vtestcpd((__v2df)__M, (__v2df)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_testnzc_pd(__m128d __M, __m128d __V) { +__funline int _mm_testnzc_pd(__m128d __M, __m128d __V) { return __builtin_ia32_vtestnzcpd((__v2df)__M, (__v2df)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_testz_ps(__m128 __M, __m128 __V) { +__funline int _mm_testz_ps(__m128 __M, __m128 __V) { return __builtin_ia32_vtestzps((__v4sf)__M, (__v4sf)__V); } -extern __inline int - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_testc_ps(__m128 __M, __m128 __V) { +__funline int _mm_testc_ps(__m128 __M, __m128 __V) { return __builtin_ia32_vtestcps((__v4sf)__M, (__v4sf)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_testnzc_ps(__m128 __M, __m128 __V) { +__funline int _mm_testnzc_ps(__m128 __M, __m128 __V) { return __builtin_ia32_vtestnzcps((__v4sf)__M, (__v4sf)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_testz_pd(__m256d __M, __m256d __V) { +__funline int _mm256_testz_pd(__m256d __M, __m256d __V) { return __builtin_ia32_vtestzpd256((__v4df)__M, (__v4df)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_testc_pd(__m256d __M, __m256d __V) { +__funline int _mm256_testc_pd(__m256d __M, __m256d __V) { return __builtin_ia32_vtestcpd256((__v4df)__M, (__v4df)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_testnzc_pd(__m256d __M, __m256d __V) { +__funline int _mm256_testnzc_pd(__m256d __M, __m256d __V) { return __builtin_ia32_vtestnzcpd256((__v4df)__M, (__v4df)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_testz_ps(__m256 __M, __m256 __V) { +__funline int _mm256_testz_ps(__m256 __M, __m256 __V) { return __builtin_ia32_vtestzps256((__v8sf)__M, (__v8sf)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_testc_ps(__m256 __M, __m256 __V) { +__funline int _mm256_testc_ps(__m256 __M, __m256 __V) { return __builtin_ia32_vtestcps256((__v8sf)__M, (__v8sf)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_testnzc_ps(__m256 __M, __m256 __V) { +__funline int _mm256_testnzc_ps(__m256 __M, __m256 __V) { return __builtin_ia32_vtestnzcps256((__v8sf)__M, (__v8sf)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_testz_si256(__m256i __M, __m256i __V) { +__funline int _mm256_testz_si256(__m256i __M, __m256i __V) { return __builtin_ia32_ptestz256((__v4di)__M, (__v4di)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_testc_si256(__m256i __M, __m256i __V) { +__funline int _mm256_testc_si256(__m256i __M, __m256i __V) { return __builtin_ia32_ptestc256((__v4di)__M, (__v4di)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_testnzc_si256(__m256i __M, __m256i __V) { +__funline int _mm256_testnzc_si256(__m256i __M, __m256i __V) { return __builtin_ia32_ptestnzc256((__v4di)__M, (__v4di)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_movemask_pd(__m256d __A) { +__funline int _mm256_movemask_pd(__m256d __A) { return __builtin_ia32_movmskpd256((__v4df)__A); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_movemask_ps(__m256 __A) { +__funline int _mm256_movemask_ps(__m256 __A) { return __builtin_ia32_movmskps256((__v8sf)__A); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_undefined_pd(void) { +__funline __m256d _mm256_undefined_pd(void) { __m256d __Y = __Y; return __Y; } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm256_undefined_ps(void) { +__funline __m256 _mm256_undefined_ps(void) { __m256 __Y = __Y; return __Y; } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_undefined_si256(void) { +__funline __m256i _mm256_undefined_si256(void) { __m256i __Y = __Y; return __Y; } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_setzero_pd(void) { +__funline __m256d _mm256_setzero_pd(void) { return __extension__(__m256d){0.0, 0.0, 0.0, 0.0}; } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_setzero_ps(void) { +__funline __m256 _mm256_setzero_ps(void) { return __extension__(__m256){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_setzero_si256(void) { +__funline __m256i _mm256_setzero_si256(void) { return __extension__(__m256i)(__v4di){0, 0, 0, 0}; } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_set_pd(double __A, double __B, double __C, double __D) { +__funline __m256d _mm256_set_pd(double __A, double __B, double __C, double __D) { return __extension__(__m256d){__D, __C, __B, __A}; } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_set_ps(float __A, float __B, float __C, float __D, float __E, - float __F, float __G, float __H) { +__funline __m256 _mm256_set_ps(float __A, float __B, float __C, float __D, + float __E, float __F, float __G, float __H) { return __extension__(__m256){__H, __G, __F, __E, __D, __C, __B, __A}; } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_set_epi32(int __A, int __B, int __C, int __D, int __E, int __F, - int __G, int __H) { +__funline __m256i _mm256_set_epi32(int __A, int __B, int __C, int __D, int __E, + int __F, int __G, int __H) { return __extension__(__m256i)(__v8si){__H, __G, __F, __E, __D, __C, __B, __A}; } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_set_epi16(short __q15, short __q14, short __q13, short __q12, - short __q11, short __q10, short __q09, short __q08, - short __q07, short __q06, short __q05, short __q04, - short __q03, short __q02, short __q01, short __q00) { +__funline __m256i _mm256_set_epi16(short __q15, short __q14, short __q13, + short __q12, short __q11, short __q10, + short __q09, short __q08, short __q07, + short __q06, short __q05, short __q04, + short __q03, short __q02, short __q01, + short __q00) { return __extension__(__m256i)(__v16hi){ __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15}; } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_set_epi8(char __q31, char __q30, char __q29, char __q28, char __q27, - char __q26, char __q25, char __q24, char __q23, char __q22, - char __q21, char __q20, char __q19, char __q18, char __q17, - char __q16, char __q15, char __q14, char __q13, char __q12, - char __q11, char __q10, char __q09, char __q08, char __q07, - char __q06, char __q05, char __q04, char __q03, char __q02, - char __q01, char __q00) { +__funline __m256i _mm256_set_epi8(char __q31, char __q30, char __q29, char __q28, + char __q27, char __q26, char __q25, char __q24, + char __q23, char __q22, char __q21, char __q20, + char __q19, char __q18, 
char __q17, char __q16, + char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, + char __q00) { return __extension__(__m256i)(__v32qi){ __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, @@ -1160,91 +875,72 @@ extern __inline __m256i __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31}; } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_set_epi64x(long long __A, long long __B, long long __C, long long __D) { +__funline __m256i _mm256_set_epi64x(long long __A, long long __B, long long __C, + long long __D) { return __extension__(__m256i)(__v4di){__D, __C, __B, __A}; } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_set1_pd(double __A) { +__funline __m256d _mm256_set1_pd(double __A) { return __extension__(__m256d){__A, __A, __A, __A}; } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_set1_ps(float __A) { +__funline __m256 _mm256_set1_ps(float __A) { return __extension__(__m256){__A, __A, __A, __A, __A, __A, __A, __A}; } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_set1_epi32(int __A) { +__funline __m256i _mm256_set1_epi32(int __A) { return __extension__(__m256i)(__v8si){__A, __A, __A, __A, __A, __A, __A, __A}; } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_set1_epi16(short __A) { +__funline __m256i _mm256_set1_epi16(short __A) { return _mm256_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_set1_epi8(char __A) { +__funline __m256i _mm256_set1_epi8(char __A) { return _mm256_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_set1_epi64x(long long __A) { +__funline __m256i _mm256_set1_epi64x(long long __A) { return __extension__(__m256i)(__v4di){__A, __A, __A, __A}; } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_setr_pd(double __A, double __B, double __C, double __D) { +__funline __m256d _mm256_setr_pd(double __A, double __B, double __C, double __D) { return _mm256_set_pd(__D, __C, __B, __A); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_setr_ps(float __A, float __B, float __C, float __D, float __E, - float __F, float __G, float __H) { +__funline __m256 _mm256_setr_ps(float __A, float __B, float __C, float __D, + float __E, float __F, float __G, float __H) { return _mm256_set_ps(__H, __G, __F, __E, __D, __C, __B, __A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_setr_epi32(int __A, int __B, int __C, int __D, int __E, int __F, - int __G, int __H) { +__funline __m256i _mm256_setr_epi32(int __A, int __B, int __C, int __D, int __E, + int __F, int __G, int __H) { return _mm256_set_epi32(__H, __G, __F, __E, __D, __C, __B, __A); } -extern __inline __m256i - __attribute__((__gnu_inline__, 
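/*
 * Editor's aside, not part of the patch: note the argument order in the
 * constructors above, _mm256_set_pd() takes elements high-to-low while
 * _mm256_setr_pd() takes them low-to-high, which is why the setr forms
 * simply forward to the set forms with the arguments reversed. A minimal
 * sketch, assuming -mavx (the function name is hypothetical):
 */
#include <immintrin.h>
static double first_lanes(void) {
  __m256d a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);    /* lane 0 holds 1.0 */
  __m256d b = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);   /* the same vector */
  return _mm256_cvtsd_f64(a) + _mm256_cvtsd_f64(b); /* yields 2.0 */
}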
__always_inline__, __artificial__)) - _mm256_setr_epi16(short __q15, short __q14, short __q13, short __q12, - short __q11, short __q10, short __q09, short __q08, - short __q07, short __q06, short __q05, short __q04, - short __q03, short __q02, short __q01, short __q00) { +__funline __m256i _mm256_setr_epi16(short __q15, short __q14, short __q13, + short __q12, short __q11, short __q10, + short __q09, short __q08, short __q07, + short __q06, short __q05, short __q04, + short __q03, short __q02, short __q01, + short __q00) { return _mm256_set_epi16(__q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_setr_epi8(char __q31, char __q30, char __q29, char __q28, char __q27, - char __q26, char __q25, char __q24, char __q23, char __q22, - char __q21, char __q20, char __q19, char __q18, char __q17, - char __q16, char __q15, char __q14, char __q13, char __q12, - char __q11, char __q10, char __q09, char __q08, char __q07, - char __q06, char __q05, char __q04, char __q03, char __q02, - char __q01, char __q00) { +__funline __m256i _mm256_setr_epi8(char __q31, char __q30, char __q29, char __q28, + char __q27, char __q26, char __q25, char __q24, + char __q23, char __q22, char __q21, char __q20, + char __q19, char __q18, char __q17, char __q16, + char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, + char __q00) { return _mm256_set_epi8(__q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23, @@ -1252,117 +948,80 @@ extern __inline __m256i __q31); } -extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm256_setr_epi64x(long long __A, long long __B, long long __C, long long __D) { +__funline __m256i _mm256_setr_epi64x(long long __A, long long __B, long long __C, + long long __D) { return _mm256_set_epi64x(__D, __C, __B, __A); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_castpd_ps(__m256d __A) { +__funline __m256 _mm256_castpd_ps(__m256d __A) { return (__m256)__A; } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_castpd_si256(__m256d __A) { +__funline __m256i _mm256_castpd_si256(__m256d __A) { return (__m256i)__A; } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_castps_pd(__m256 __A) { +__funline __m256d _mm256_castps_pd(__m256 __A) { return (__m256d)__A; } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_castps_si256(__m256 __A) { +__funline __m256i _mm256_castps_si256(__m256 __A) { return (__m256i)__A; } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_castsi256_ps(__m256i __A) { +__funline __m256 _mm256_castsi256_ps(__m256i __A) { return (__m256)__A; } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_castsi256_pd(__m256i __A) { +__funline __m256d _mm256_castsi256_pd(__m256i __A) { return (__m256d)__A; } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_castpd256_pd128(__m256d __A) { 
+__funline __m128d _mm256_castpd256_pd128(__m256d __A) { return (__m128d)__builtin_ia32_pd_pd256((__v4df)__A); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_castps256_ps128(__m256 __A) { +__funline __m128 _mm256_castps256_ps128(__m256 __A) { return (__m128)__builtin_ia32_ps_ps256((__v8sf)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_castsi256_si128(__m256i __A) { +__funline __m128i _mm256_castsi256_si128(__m256i __A) { return (__m128i)__builtin_ia32_si_si256((__v8si)__A); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_castpd128_pd256(__m128d __A) { +__funline __m256d _mm256_castpd128_pd256(__m128d __A) { return (__m256d)__builtin_ia32_pd256_pd((__v2df)__A); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_castps128_ps256(__m128 __A) { +__funline __m256 _mm256_castps128_ps256(__m128 __A) { return (__m256)__builtin_ia32_ps256_ps((__v4sf)__A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_castsi128_si256(__m128i __A) { +__funline __m256i _mm256_castsi128_si256(__m128i __A) { return (__m256i)__builtin_ia32_si256_si((__v4si)__A); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_set_m128(__m128 __H, __m128 __L) { +__funline __m256 _mm256_set_m128(__m128 __H, __m128 __L) { return _mm256_insertf128_ps(_mm256_castps128_ps256(__L), __H, 1); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_set_m128d(__m128d __H, __m128d __L) { +__funline __m256d _mm256_set_m128d(__m128d __H, __m128d __L) { return _mm256_insertf128_pd(_mm256_castpd128_pd256(__L), __H, 1); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_set_m128i(__m128i __H, __m128i __L) { +__funline __m256i _mm256_set_m128i(__m128i __H, __m128i __L) { return _mm256_insertf128_si256(_mm256_castsi128_si256(__L), __H, 1); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_setr_m128(__m128 __L, __m128 __H) { +__funline __m256 _mm256_setr_m128(__m128 __L, __m128 __H) { return _mm256_set_m128(__H, __L); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_setr_m128d(__m128d __L, __m128d __H) { +__funline __m256d _mm256_setr_m128d(__m128d __L, __m128d __H) { return _mm256_set_m128d(__H, __L); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_setr_m128i(__m128i __L, __m128i __H) { +__funline __m256i _mm256_setr_m128i(__m128i __L, __m128i __H) { return _mm256_set_m128i(__H, __L); } diff --git a/third_party/intel/bmi2intrin.internal.h b/third_party/intel/bmi2intrin.internal.h index d4c1e7499..15ba16ae7 100644 --- a/third_party/intel/bmi2intrin.internal.h +++ b/third_party/intel/bmi2intrin.internal.h @@ -11,48 +11,38 @@ #define __DISABLE_BMI2__ #endif /* __BMI2__ */ -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _bzhi_u32(unsigned int __X, unsigned int __Y) { +__funline unsigned int _bzhi_u32(unsigned int __X, unsigned int __Y) { return __builtin_ia32_bzhi_si(__X, __Y); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _pdep_u32(unsigned int 
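/*
 * Editor's aside, not part of the patch: _pext_u32() gathers the bits of
 * its first operand that are selected by the mask into the low bits of the
 * result, and _pdep_u32() is the inverse scatter. A minimal sketch,
 * assuming -mbmi2 (the function name is hypothetical):
 */
#include <immintrin.h>
static unsigned pext_pdep_demo(void) {
  unsigned f = _pext_u32(0xABCD1234u, 0x0000FF00u); /* f == 0x12 */
  return _pdep_u32(f, 0xFF000000u);                 /* == 0x12000000 */
}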
__X, unsigned int __Y) { +__funline unsigned int _pdep_u32(unsigned int __X, unsigned int __Y) { return __builtin_ia32_pdep_si(__X, __Y); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _pext_u32(unsigned int __X, unsigned int __Y) { +__funline unsigned int _pext_u32(unsigned int __X, unsigned int __Y) { return __builtin_ia32_pext_si(__X, __Y); } #ifdef __x86_64__ -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _bzhi_u64(unsigned long long __X, unsigned long long __Y) { +__funline unsigned long long _bzhi_u64(unsigned long long __X, + unsigned long long __Y) { return __builtin_ia32_bzhi_di(__X, __Y); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _pdep_u64(unsigned long long __X, unsigned long long __Y) { +__funline unsigned long long _pdep_u64(unsigned long long __X, + unsigned long long __Y) { return __builtin_ia32_pdep_di(__X, __Y); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _pext_u64(unsigned long long __X, unsigned long long __Y) { +__funline unsigned long long _pext_u64(unsigned long long __X, + unsigned long long __Y) { return __builtin_ia32_pext_di(__X, __Y); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mulx_u64(unsigned long long __X, unsigned long long __Y, - unsigned long long *__P) { +__funline unsigned long long _mulx_u64(unsigned long long __X, + unsigned long long __Y, + unsigned long long *__P) { unsigned __int128 __res = (unsigned __int128)__X * __Y; *__P = (unsigned long long)(__res >> 64); return (unsigned long long)__res; @@ -60,9 +50,8 @@ extern __inline unsigned long long #else /* !__x86_64__ */ -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) { +__funline unsigned int _mulx_u32(unsigned int __X, unsigned int __Y, + unsigned int *__P) { unsigned long long __res = (unsigned long long)__X * __Y; *__P = (unsigned int)(__res >> 32); return (unsigned int)__res; diff --git a/third_party/intel/cetintrin.internal.h b/third_party/intel/cetintrin.internal.h index fa31a21b5..63617f28e 100644 --- a/third_party/intel/cetintrin.internal.h +++ b/third_party/intel/cetintrin.internal.h @@ -12,22 +12,16 @@ #endif /* __SHSTK__ */ #ifdef __x86_64__ -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _get_ssp(void) { +__funline unsigned long long _get_ssp(void) { return __builtin_ia32_rdsspq(); } #else -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _get_ssp(void) { +__funline unsigned int _get_ssp(void) { return __builtin_ia32_rdsspd(); } #endif -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _inc_ssp(unsigned int __B) { +__funline void _inc_ssp(unsigned int __B) { #ifdef __x86_64__ __builtin_ia32_incsspq((unsigned long long)__B); #else @@ -35,55 +29,39 @@ extern __inline void #endif } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _saveprevssp(void) { +__funline void _saveprevssp(void) { __builtin_ia32_saveprevssp(); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _rstorssp(void *__B) { +__funline void _rstorssp(void *__B) { 
__builtin_ia32_rstorssp(__B); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _wrssd(unsigned int __B, void *__C) { +__funline void _wrssd(unsigned int __B, void *__C) { __builtin_ia32_wrssd(__B, __C); } #ifdef __x86_64__ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _wrssq(unsigned long long __B, void *__C) { +__funline void _wrssq(unsigned long long __B, void *__C) { __builtin_ia32_wrssq(__B, __C); } #endif -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _wrussd(unsigned int __B, void *__C) { +__funline void _wrussd(unsigned int __B, void *__C) { __builtin_ia32_wrussd(__B, __C); } #ifdef __x86_64__ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _wrussq(unsigned long long __B, void *__C) { +__funline void _wrussq(unsigned long long __B, void *__C) { __builtin_ia32_wrussq(__B, __C); } #endif -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _setssbsy(void) { +__funline void _setssbsy(void) { __builtin_ia32_setssbsy(); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _clrssbsy(void *__B) { +__funline void _clrssbsy(void *__B) { __builtin_ia32_clrssbsy(__B); } diff --git a/third_party/intel/cldemoteintrin.internal.h b/third_party/intel/cldemoteintrin.internal.h index 7a053d6fa..ee9d1eefb 100644 --- a/third_party/intel/cldemoteintrin.internal.h +++ b/third_party/intel/cldemoteintrin.internal.h @@ -10,9 +10,7 @@ #pragma GCC target("cldemote") #define __DISABLE_CLDEMOTE__ #endif /* __CLDEMOTE__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _cldemote(void *__A) { +__funline void _cldemote(void *__A) { __builtin_ia32_cldemote(__A); } #ifdef __DISABLE_CLDEMOTE__ diff --git a/third_party/intel/clflushoptintrin.internal.h b/third_party/intel/clflushoptintrin.internal.h index da1d119eb..cd974e3b4 100644 --- a/third_party/intel/clflushoptintrin.internal.h +++ b/third_party/intel/clflushoptintrin.internal.h @@ -11,9 +11,7 @@ #define __DISABLE_CLFLUSHOPT__ #endif /* __CLFLUSHOPT__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_clflushopt(void *__A) { +__funline void _mm_clflushopt(void *__A) { __builtin_ia32_clflushopt(__A); } diff --git a/third_party/intel/clwbintrin.internal.h b/third_party/intel/clwbintrin.internal.h index 3180c94db..8f6f9d7ed 100644 --- a/third_party/intel/clwbintrin.internal.h +++ b/third_party/intel/clwbintrin.internal.h @@ -11,9 +11,7 @@ #define __DISABLE_CLWB__ #endif /* __CLWB__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_clwb(void *__A) { +__funline void _mm_clwb(void *__A) { __builtin_ia32_clwb(__A); } diff --git a/third_party/intel/clzerointrin.internal.h b/third_party/intel/clzerointrin.internal.h index a8db77bc2..c9261ed09 100644 --- a/third_party/intel/clzerointrin.internal.h +++ b/third_party/intel/clzerointrin.internal.h @@ -8,9 +8,7 @@ #define __DISABLE_CLZERO__ #endif /* __CLZERO__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_clzero(void* __I) { +__funline void _mm_clzero(void* __I) { __builtin_ia32_clzero(__I); } diff --git a/third_party/intel/emmintrin.internal.h b/third_party/intel/emmintrin.internal.h index e7ec5aad7..712dbfb41 100644 --- a/third_party/intel/emmintrin.internal.h +++ 
b/third_party/intel/emmintrin.internal.h @@ -30,818 +30,558 @@ typedef double __m128d_u #define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0)) -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_set_sd(double __F) { +__funline __m128d _mm_set_sd(double __F) { return __extension__(__m128d){__F, 0.0}; } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_set1_pd(double __F) { +__funline __m128d _mm_set1_pd(double __F) { return __extension__(__m128d){__F, __F}; } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_set_pd1(double __F) { +__funline __m128d _mm_set_pd1(double __F) { return _mm_set1_pd(__F); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_set_pd(double __W, double __X) { +__funline __m128d _mm_set_pd(double __W, double __X) { return __extension__(__m128d){__X, __W}; } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_setr_pd(double __W, double __X) { +__funline __m128d _mm_setr_pd(double __W, double __X) { return __extension__(__m128d){__W, __X}; } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_undefined_pd(void) { +__funline __m128d _mm_undefined_pd(void) { __m128d __Y = __Y; return __Y; } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_setzero_pd(void) { +__funline __m128d _mm_setzero_pd(void) { return __extension__(__m128d){0.0, 0.0}; } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_move_sd(__m128d __A, __m128d __B) { +__funline __m128d _mm_move_sd(__m128d __A, __m128d __B) { return __extension__(__m128d) __builtin_shuffle((__v2df)__A, (__v2df)__B, (__v2di){2, 1}); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_load_pd(double const *__P) { +__funline __m128d _mm_load_pd(double const *__P) { return *(__m128d *)__P; } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_loadu_pd(double const *__P) { +__funline __m128d _mm_loadu_pd(double const *__P) { return *(__m128d_u *)__P; } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_load1_pd(double const *__P) { +__funline __m128d _mm_load1_pd(double const *__P) { return _mm_set1_pd(*__P); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_load_sd(double const *__P) { +__funline __m128d _mm_load_sd(double const *__P) { return _mm_set_sd(*__P); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_load_pd1(double const *__P) { +__funline __m128d _mm_load_pd1(double const *__P) { return _mm_load1_pd(__P); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_loadr_pd(double const *__P) { +__funline __m128d _mm_loadr_pd(double const *__P) { __m128d __tmp = _mm_load_pd(__P); return __builtin_ia32_shufpd(__tmp, __tmp, _MM_SHUFFLE2(0, 1)); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_store_pd(double *__P, __m128d __A) { +__funline void _mm_store_pd(double *__P, __m128d __A) { *(__m128d *)__P = __A; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm_storeu_pd(double *__P, __m128d __A) { +__funline void _mm_storeu_pd(double *__P, __m128d __A) { *(__m128d_u *)__P = __A; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_store_sd(double *__P, __m128d __A) { +__funline void _mm_store_sd(double *__P, __m128d __A) { *__P = ((__v2df)__A)[0]; } -extern __inline double - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsd_f64(__m128d __A) { +__funline double _mm_cvtsd_f64(__m128d __A) { return ((__v2df)__A)[0]; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_storel_pd(double *__P, __m128d __A) { +__funline void _mm_storel_pd(double *__P, __m128d __A) { _mm_store_sd(__P, __A); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_storeh_pd(double *__P, __m128d __A) { +__funline void _mm_storeh_pd(double *__P, __m128d __A) { *__P = ((__v2df)__A)[1]; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_store1_pd(double *__P, __m128d __A) { +__funline void _mm_store1_pd(double *__P, __m128d __A) { _mm_store_pd(__P, __builtin_ia32_shufpd(__A, __A, _MM_SHUFFLE2(0, 0))); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_store_pd1(double *__P, __m128d __A) { +__funline void _mm_store_pd1(double *__P, __m128d __A) { _mm_store1_pd(__P, __A); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_storer_pd(double *__P, __m128d __A) { +__funline void _mm_storer_pd(double *__P, __m128d __A) { _mm_store_pd(__P, __builtin_ia32_shufpd(__A, __A, _MM_SHUFFLE2(0, 1))); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsi128_si32(__m128i __A) { +__funline int _mm_cvtsi128_si32(__m128i __A) { return __builtin_ia32_vec_ext_v4si((__v4si)__A, 0); } #ifdef __x86_64__ -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsi128_si64(__m128i __A) { +__funline long long _mm_cvtsi128_si64(__m128i __A) { return ((__v2di)__A)[0]; } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsi128_si64x(__m128i __A) { +__funline long long _mm_cvtsi128_si64x(__m128i __A) { return ((__v2di)__A)[0]; } #endif -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_add_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_add_pd(__m128d __A, __m128d __B) { return (__m128d)((__v2df)__A + (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_add_sd(__m128d __A, __m128d __B) { +__funline __m128d _mm_add_sd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_addsd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sub_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_sub_pd(__m128d __A, __m128d __B) { return (__m128d)((__v2df)__A - (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sub_sd(__m128d __A, __m128d __B) { +__funline __m128d _mm_sub_sd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_subsd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mul_pd(__m128d __A, __m128d __B) { 
+__funline __m128d _mm_mul_pd(__m128d __A, __m128d __B) { return (__m128d)((__v2df)__A * (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mul_sd(__m128d __A, __m128d __B) { +__funline __m128d _mm_mul_sd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_mulsd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_div_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_div_pd(__m128d __A, __m128d __B) { return (__m128d)((__v2df)__A / (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_div_sd(__m128d __A, __m128d __B) { +__funline __m128d _mm_div_sd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_divsd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sqrt_pd(__m128d __A) { +__funline __m128d _mm_sqrt_pd(__m128d __A) { return (__m128d)__builtin_ia32_sqrtpd((__v2df)__A); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sqrt_sd(__m128d __A, __m128d __B) { +__funline __m128d _mm_sqrt_sd(__m128d __A, __m128d __B) { __v2df __tmp = __builtin_ia32_movsd((__v2df)__A, (__v2df)__B); return (__m128d)__builtin_ia32_sqrtsd((__v2df)__tmp); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_min_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_min_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_minpd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_min_sd(__m128d __A, __m128d __B) { +__funline __m128d _mm_min_sd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_minsd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_max_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_max_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_maxpd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_max_sd(__m128d __A, __m128d __B) { +__funline __m128d _mm_max_sd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_maxsd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_and_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_and_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_andpd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_andnot_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_andnot_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_andnpd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_or_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_or_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_orpd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_xor_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_xor_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_xorpd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, 
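/*
 * Editor's aside, not part of the patch: the packed forms above (_pd)
 * operate on both double lanes, while the scalar forms (_sd) touch only
 * lane 0 and pass lane 1 of the first operand through unchanged. A minimal
 * sketch, assuming -msse2 (the function name is hypothetical):
 */
#include <emmintrin.h>
static double dot2(const double *a, const double *b) {
  __m128d p  = _mm_mul_pd(_mm_loadu_pd(a), _mm_loadu_pd(b));
  __m128d hi = _mm_unpackhi_pd(p, p);      /* lane 1 copied to both lanes */
  return _mm_cvtsd_f64(_mm_add_sd(p, hi)); /* a[0]*b[0] + a[1]*b[1] */
}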
__artificial__)) - _mm_cmpeq_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_cmpeq_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmplt_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_cmplt_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_cmpltpd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmple_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_cmple_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_cmplepd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpgt_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_cmpgt_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_cmpgtpd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpge_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_cmpge_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_cmpgepd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpneq_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_cmpneq_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpnlt_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_cmpnlt_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpnle_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_cmpnle_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpngt_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_cmpngt_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_cmpngtpd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpnge_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_cmpnge_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_cmpngepd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpord_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_cmpord_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_cmpordpd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpunord_pd(__m128d __A, __m128d __B) { +__funline __m128d _mm_cmpunord_pd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpeq_sd(__m128d __A, __m128d __B) { +__funline __m128d _mm_cmpeq_sd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__A, (__v2df)__B); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmplt_sd(__m128d __A, __m128d __B) { +__funline __m128d 
_mm_cmplt_sd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_cmpltsd((__v2df)__A, (__v2df)__B);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmple_sd(__m128d __A, __m128d __B) {
+__funline __m128d _mm_cmple_sd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_cmplesd((__v2df)__A, (__v2df)__B);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmpgt_sd(__m128d __A, __m128d __B) {
+__funline __m128d _mm_cmpgt_sd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_movsd(
       (__v2df)__A, (__v2df)__builtin_ia32_cmpltsd((__v2df)__B, (__v2df)__A));
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmpge_sd(__m128d __A, __m128d __B) {
+__funline __m128d _mm_cmpge_sd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_movsd(
       (__v2df)__A, (__v2df)__builtin_ia32_cmplesd((__v2df)__B, (__v2df)__A));
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmpneq_sd(__m128d __A, __m128d __B) {
+__funline __m128d _mm_cmpneq_sd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__A, (__v2df)__B);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
+__funline __m128d _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__A, (__v2df)__B);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmpnle_sd(__m128d __A, __m128d __B) {
+__funline __m128d _mm_cmpnle_sd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__A, (__v2df)__B);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmpngt_sd(__m128d __A, __m128d __B) {
+__funline __m128d _mm_cmpngt_sd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_movsd(
       (__v2df)__A, (__v2df)__builtin_ia32_cmpnltsd((__v2df)__B, (__v2df)__A));
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmpnge_sd(__m128d __A, __m128d __B) {
+__funline __m128d _mm_cmpnge_sd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_movsd(
       (__v2df)__A, (__v2df)__builtin_ia32_cmpnlesd((__v2df)__B, (__v2df)__A));
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmpord_sd(__m128d __A, __m128d __B) {
+__funline __m128d _mm_cmpord_sd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_cmpordsd((__v2df)__A, (__v2df)__B);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmpunord_sd(__m128d __A, __m128d __B) {
+__funline __m128d _mm_cmpunord_sd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__A, (__v2df)__B);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_comieq_sd(__m128d __A, __m128d __B) {
+__funline int _mm_comieq_sd(__m128d __A, __m128d __B) {
   return __builtin_ia32_comisdeq((__v2df)__A, (__v2df)__B);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_comilt_sd(__m128d __A, __m128d __B) {
+__funline int _mm_comilt_sd(__m128d __A, __m128d __B) {
   return __builtin_ia32_comisdlt((__v2df)__A, (__v2df)__B);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_comile_sd(__m128d __A, __m128d __B) {
+__funline int _mm_comile_sd(__m128d __A, __m128d __B) {
   return __builtin_ia32_comisdle((__v2df)__A, (__v2df)__B);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_comigt_sd(__m128d __A, __m128d __B) {
+__funline int _mm_comigt_sd(__m128d __A, __m128d __B) {
   return __builtin_ia32_comisdgt((__v2df)__A, (__v2df)__B);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_comige_sd(__m128d __A, __m128d __B) {
+__funline int _mm_comige_sd(__m128d __A, __m128d __B) {
   return __builtin_ia32_comisdge((__v2df)__A, (__v2df)__B);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_comineq_sd(__m128d __A, __m128d __B) {
+__funline int _mm_comineq_sd(__m128d __A, __m128d __B) {
   return __builtin_ia32_comisdneq((__v2df)__A, (__v2df)__B);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_ucomieq_sd(__m128d __A, __m128d __B) {
+__funline int _mm_ucomieq_sd(__m128d __A, __m128d __B) {
   return __builtin_ia32_ucomisdeq((__v2df)__A, (__v2df)__B);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_ucomilt_sd(__m128d __A, __m128d __B) {
+__funline int _mm_ucomilt_sd(__m128d __A, __m128d __B) {
   return __builtin_ia32_ucomisdlt((__v2df)__A, (__v2df)__B);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_ucomile_sd(__m128d __A, __m128d __B) {
+__funline int _mm_ucomile_sd(__m128d __A, __m128d __B) {
   return __builtin_ia32_ucomisdle((__v2df)__A, (__v2df)__B);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_ucomigt_sd(__m128d __A, __m128d __B) {
+__funline int _mm_ucomigt_sd(__m128d __A, __m128d __B) {
   return __builtin_ia32_ucomisdgt((__v2df)__A, (__v2df)__B);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_ucomige_sd(__m128d __A, __m128d __B) {
+__funline int _mm_ucomige_sd(__m128d __A, __m128d __B) {
   return __builtin_ia32_ucomisdge((__v2df)__A, (__v2df)__B);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_ucomineq_sd(__m128d __A, __m128d __B) {
+__funline int _mm_ucomineq_sd(__m128d __A, __m128d __B) {
   return __builtin_ia32_ucomisdneq((__v2df)__A, (__v2df)__B);
 }
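/* Editor's note, not part of the patch: a minimal usage sketch of the two
   scalar-double comparison families converted above.  The _mm_cmp*_sd
   intrinsics return an all-ones or all-zero bit mask in lane 0 (lane 1 is
   copied from the first operand), while the comi/ucomi family collapses the
   lane-0 comparison into a plain int.  Test values are illustrative. */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d a = _mm_set_sd(1.0);                       /* lane 0 = 1.0 */
  __m128d b = _mm_set_sd(2.0);                       /* lane 0 = 2.0 */
  __m128d m = _mm_cmplt_sd(a, b);                    /* lane 0 becomes all-ones */
  printf("mask bit: %d\n", _mm_movemask_pd(m) & 1);  /* prints 1 */
  printf("comi:     %d\n", _mm_comilt_sd(a, b));     /* prints 1 */
  return 0;
}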
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_set_epi64x(long long __q1, long long __q0) {
+__funline __m128i _mm_set_epi64x(long long __q1, long long __q0) {
   return __extension__(__m128i)(__v2di){__q0, __q1};
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_set_epi64(__m64 __q1, __m64 __q0) {
+__funline __m128i _mm_set_epi64(__m64 __q1, __m64 __q0) {
   return _mm_set_epi64x((long long)__q1, (long long)__q0);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
+__funline __m128i _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
   return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
-                  short __q2, short __q1, short __q0) {
+__funline __m128i _mm_set_epi16(short __q7, short __q6, short __q5, short __q4,
+                                short __q3, short __q2, short __q1,
+                                short __q0) {
   return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
                                         __q4, __q5, __q6, __q7};
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
-                 char __q10, char __q09, char __q08, char __q07, char __q06,
-                 char __q05, char __q04, char __q03, char __q02, char __q01,
-                 char __q00) {
+__funline __m128i _mm_set_epi8(char __q15, char __q14, char __q13, char __q12,
+                               char __q11, char __q10, char __q09, char __q08,
+                               char __q07, char __q06, char __q05, char __q04,
+                               char __q03, char __q02, char __q01, char __q00) {
   return __extension__(__m128i)(__v16qi){
       __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
       __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_set1_epi64x(long long __A) {
+__funline __m128i _mm_set1_epi64x(long long __A) {
   return _mm_set_epi64x(__A, __A);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_set1_epi64(__m64 __A) {
+__funline __m128i _mm_set1_epi64(__m64 __A) {
   return _mm_set_epi64(__A, __A);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_set1_epi32(int __A) {
+__funline __m128i _mm_set1_epi32(int __A) {
   return _mm_set_epi32(__A, __A, __A, __A);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_set1_epi16(short __A) {
+__funline __m128i _mm_set1_epi16(short __A) {
   return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_set1_epi8(char __A) {
+__funline __m128i _mm_set1_epi8(char __A) {
   return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A,
                       __A, __A, __A, __A, __A, __A, __A, __A);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_setr_epi64(__m64 __q0, __m64 __q1) {
+__funline __m128i _mm_setr_epi64(__m64 __q0, __m64 __q1) {
   return _mm_set_epi64(__q1, __q0);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
+__funline __m128i _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
   return _mm_set_epi32(__q3, __q2, __q1, __q0);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
-                   short __q5, short __q6, short __q7) {
+__funline __m128i _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3,
+                                 short __q4, short __q5, short __q6,
+                                 short __q7) {
   return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
-                  char __q05, char __q06, char __q07, char __q08, char __q09,
-                  char __q10, char __q11, char __q12, char __q13, char __q14,
-                  char __q15) {
+__funline __m128i _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03,
+                                char __q04, char __q05, char __q06, char __q07,
+                                char __q08, char __q09, char __q10, char __q11,
+                                char __q12, char __q13, char __q14, char __q15) {
   return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
                       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
 }
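/* Editor's note, a sketch rather than part of the patch: _mm_set_* takes its
   arguments from the highest element down to element 0, while _mm_setr_*
   ("reversed") takes them in memory order, so these two calls build the same
   vector. */
#include <assert.h>
#include <emmintrin.h>

static void demo_set_order(void) {
  __m128i x = _mm_set_epi32(3, 2, 1, 0);   /* element 0 = 0, element 3 = 3 */
  __m128i y = _mm_setr_epi32(0, 1, 2, 3);  /* same layout, reversed notation */
  assert(_mm_movemask_epi8(_mm_cmpeq_epi32(x, y)) == 0xFFFF);
}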
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_load_si128(__m128i const *__P) {
+__funline __m128i _mm_load_si128(__m128i const *__P) {
   return *__P;
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_loadu_si128(__m128i_u const *__P) {
+__funline __m128i _mm_loadu_si128(__m128i_u const *__P) {
   return *__P;
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_loadl_epi64(__m128i_u const *__P) {
+__funline __m128i _mm_loadl_epi64(__m128i_u const *__P) {
   return _mm_set_epi64((__m64)0LL, *(__m64_u *)__P);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_loadu_si64(void const *__P) {
+__funline __m128i _mm_loadu_si64(void const *__P) {
   return _mm_loadl_epi64((__m128i_u *)__P);
 }
-extern __inline void
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_store_si128(__m128i *__P, __m128i __B) {
+__funline void _mm_store_si128(__m128i *__P, __m128i __B) {
   *__P = __B;
 }
-extern __inline void
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
+__funline void _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
   *__P = __B;
 }
-extern __inline void
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
+__funline void _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
   *(__m64_u *)__P = (__m64)((__v2di)__B)[0];
 }
-extern __inline void
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_storeu_si64(void *__P, __m128i __B) {
+__funline void _mm_storeu_si64(void *__P, __m128i __B) {
   _mm_storel_epi64((__m128i_u *)__P, __B);
 }
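/* Editor's note, an illustrative sketch: per the usual SSE2 contract,
   _mm_load_si128/_mm_store_si128 require a 16-byte-aligned address, while
   the loadu/storeu forms accept any alignment.  The parameter and buffer
   names are hypothetical. */
#include <emmintrin.h>

static void demo_loads(const unsigned char *p /* arbitrary alignment */) {
  __m128i v = _mm_loadu_si128((const __m128i_u *)p);  /* ok on any address */
  _Alignas(16) unsigned char out[16];
  _mm_store_si128((__m128i *)out, v);  /* needs 16-byte alignment */
  (void)out;
}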
-extern __inline __m64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_movepi64_pi64(__m128i __B) {
+__funline __m64 _mm_movepi64_pi64(__m128i __B) {
   return (__m64)((__v2di)__B)[0];
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_movpi64_epi64(__m64 __A) {
+__funline __m128i _mm_movpi64_epi64(__m64 __A) {
   return _mm_set_epi64((__m64)0LL, __A);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_move_epi64(__m128i __A) {
+__funline __m128i _mm_move_epi64(__m128i __A) {
   return (__m128i)__builtin_ia32_movq128((__v2di)__A);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_undefined_si128(void) {
+__funline __m128i _mm_undefined_si128(void) {
   __m128i __Y = __Y;
   return __Y;
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_setzero_si128(void) {
+__funline __m128i _mm_setzero_si128(void) {
   return __extension__(__m128i)(__v4si){0, 0, 0, 0};
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtepi32_pd(__m128i __A) {
+__funline __m128d _mm_cvtepi32_pd(__m128i __A) {
   return (__m128d)__builtin_ia32_cvtdq2pd((__v4si)__A);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtepi32_ps(__m128i __A) {
+__funline __m128 _mm_cvtepi32_ps(__m128i __A) {
   return (__m128)__builtin_ia32_cvtdq2ps((__v4si)__A);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtpd_epi32(__m128d __A) {
+__funline __m128i _mm_cvtpd_epi32(__m128d __A) {
   return (__m128i)__builtin_ia32_cvtpd2dq((__v2df)__A);
 }
-extern __inline __m64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtpd_pi32(__m128d __A) {
+__funline __m64 _mm_cvtpd_pi32(__m128d __A) {
   return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__A);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtpd_ps(__m128d __A) {
+__funline __m128 _mm_cvtpd_ps(__m128d __A) {
   return (__m128)__builtin_ia32_cvtpd2ps((__v2df)__A);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvttpd_epi32(__m128d __A) {
+__funline __m128i _mm_cvttpd_epi32(__m128d __A) {
   return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__A);
 }
-extern __inline __m64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvttpd_pi32(__m128d __A) {
+__funline __m64 _mm_cvttpd_pi32(__m128d __A) {
   return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__A);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtpi32_pd(__m64 __A) {
+__funline __m128d _mm_cvtpi32_pd(__m64 __A) {
   return (__m128d)__builtin_ia32_cvtpi2pd((__v2si)__A);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtps_epi32(__m128 __A) {
+__funline __m128i _mm_cvtps_epi32(__m128 __A) {
   return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__A);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvttps_epi32(__m128 __A) {
+__funline __m128i _mm_cvttps_epi32(__m128 __A) {
   return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__A);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtps_pd(__m128 __A) {
+__funline __m128d _mm_cvtps_pd(__m128 __A) {
   return (__m128d)__builtin_ia32_cvtps2pd((__v4sf)__A);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtsd_si32(__m128d __A) {
+__funline int _mm_cvtsd_si32(__m128d __A) {
   return __builtin_ia32_cvtsd2si((__v2df)__A);
 }
 #ifdef __x86_64__
-extern __inline long long
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtsd_si64(__m128d __A) {
+__funline long long _mm_cvtsd_si64(__m128d __A) {
   return __builtin_ia32_cvtsd2si64((__v2df)__A);
 }
-extern __inline long long
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtsd_si64x(__m128d __A) {
+__funline long long _mm_cvtsd_si64x(__m128d __A) {
   return __builtin_ia32_cvtsd2si64((__v2df)__A);
 }
 #endif
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvttsd_si32(__m128d __A) {
+__funline int _mm_cvttsd_si32(__m128d __A) {
   return __builtin_ia32_cvttsd2si((__v2df)__A);
 }
 #ifdef __x86_64__
-extern __inline long long
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvttsd_si64(__m128d __A) {
+__funline long long _mm_cvttsd_si64(__m128d __A) {
   return __builtin_ia32_cvttsd2si64((__v2df)__A);
 }
-extern __inline long long
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvttsd_si64x(__m128d __A) {
+__funline long long _mm_cvttsd_si64x(__m128d __A) {
   return __builtin_ia32_cvttsd2si64((__v2df)__A);
 }
 #endif
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtsd_ss(__m128 __A, __m128d __B) {
+__funline __m128 _mm_cvtsd_ss(__m128 __A, __m128d __B) {
   return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__A, (__v2df)__B);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtsi32_sd(__m128d __A, int __B) {
+__funline __m128d _mm_cvtsi32_sd(__m128d __A, int __B) {
   return (__m128d)__builtin_ia32_cvtsi2sd((__v2df)__A, __B);
 }
 #ifdef __x86_64__
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtsi64_sd(__m128d __A, long long __B) {
+__funline __m128d _mm_cvtsi64_sd(__m128d __A, long long __B) {
   return (__m128d)__builtin_ia32_cvtsi642sd((__v2df)__A, __B);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtsi64x_sd(__m128d __A, long long __B) {
+__funline __m128d _mm_cvtsi64x_sd(__m128d __A, long long __B) {
   return (__m128d)__builtin_ia32_cvtsi642sd((__v2df)__A, __B);
 }
 #endif
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtss_sd(__m128d __A, __m128 __B) {
+__funline __m128d _mm_cvtss_sd(__m128d __A, __m128 __B) {
   return (__m128d)__builtin_ia32_cvtss2sd((__v2df)__A, (__v4sf)__B);
 }
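/* Editor's note, a sketch of the conversion semantics above: the cvt* forms
   round according to MXCSR (round-to-nearest-even by default), while the
   cvtt* forms truncate toward zero, matching a C cast. */
#include <emmintrin.h>
#include <stdio.h>

static void demo_cvt(void) {
  __m128d v = _mm_set_pd(2.5, -1.5);  /* lane 0 = -1.5, lane 1 = 2.5 */
  __m128i r = _mm_cvtpd_epi32(v);     /* lane 0 rounds to -2 */
  __m128i t = _mm_cvttpd_epi32(v);    /* lane 0 truncates to -1 */
  printf("%d %d\n", _mm_cvtsi128_si32(r), _mm_cvtsi128_si32(t));  /* -2 -1 */
}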
 #ifdef __OPTIMIZE__
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
+__funline __m128d _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
   return (__m128d)__builtin_ia32_shufpd((__v2df)__A, (__v2df)__B, __mask);
 }
 #else
@@ -850,280 +590,188 @@ extern __inline __m128d
       (int)(N)))
 #endif
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_unpackhi_pd(__m128d __A, __m128d __B) {
+__funline __m128d _mm_unpackhi_pd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_unpckhpd((__v2df)__A, (__v2df)__B);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_unpacklo_pd(__m128d __A, __m128d __B) {
+__funline __m128d _mm_unpacklo_pd(__m128d __A, __m128d __B) {
   return (__m128d)__builtin_ia32_unpcklpd((__v2df)__A, (__v2df)__B);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_loadh_pd(__m128d __A, double const *__B) {
+__funline __m128d _mm_loadh_pd(__m128d __A, double const *__B) {
   return (__m128d)__builtin_ia32_loadhpd((__v2df)__A, __B);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_loadl_pd(__m128d __A, double const *__B) {
+__funline __m128d _mm_loadl_pd(__m128d __A, double const *__B) {
   return (__m128d)__builtin_ia32_loadlpd((__v2df)__A, __B);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_movemask_pd(__m128d __A) {
+__funline int _mm_movemask_pd(__m128d __A) {
   return __builtin_ia32_movmskpd((__v2df)__A);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_packs_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_packs_epi16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__A, (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_packs_epi32(__m128i __A, __m128i __B) {
+__funline __m128i _mm_packs_epi32(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_packssdw128((__v4si)__A, (__v4si)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_packus_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_packus_epi16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__A, (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
+__funline __m128i _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_punpckhbw128((__v16qi)__A, (__v16qi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_punpckhwd128((__v8hi)__A, (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
+__funline __m128i _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__A, (__v4si)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
+__funline __m128i _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_punpckhqdq128((__v2di)__A, (__v2di)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
+__funline __m128i _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_punpcklbw128((__v16qi)__A, (__v16qi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_punpcklwd128((__v8hi)__A, (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
+__funline __m128i _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_punpckldq128((__v4si)__A, (__v4si)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
+__funline __m128i _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__A, (__v2di)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_add_epi8(__m128i __A, __m128i __B) {
+__funline __m128i _mm_add_epi8(__m128i __A, __m128i __B) {
   return (__m128i)((__v16qu)__A + (__v16qu)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_add_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_add_epi16(__m128i __A, __m128i __B) {
   return (__m128i)((__v8hu)__A + (__v8hu)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_add_epi32(__m128i __A, __m128i __B) {
+__funline __m128i _mm_add_epi32(__m128i __A, __m128i __B) {
   return (__m128i)((__v4su)__A + (__v4su)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_add_epi64(__m128i __A, __m128i __B) {
+__funline __m128i _mm_add_epi64(__m128i __A, __m128i __B) {
   return (__m128i)((__v2du)__A + (__v2du)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_adds_epi8(__m128i __A, __m128i __B) {
+__funline __m128i _mm_adds_epi8(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_paddsb128((__v16qi)__A, (__v16qi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_adds_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_adds_epi16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_paddsw128((__v8hi)__A, (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_adds_epu8(__m128i __A, __m128i __B) {
+__funline __m128i _mm_adds_epu8(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_paddusb128((__v16qi)__A, (__v16qi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_adds_epu16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_adds_epu16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_paddusw128((__v8hi)__A, (__v8hi)__B);
 }
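/* Editor's note, a sketch: _mm_add_epi8 wraps modulo 256 in each byte lane,
   while the adds/adds_epu forms above saturate at the signed or unsigned
   range boundary instead of wrapping. */
#include <emmintrin.h>

static void demo_saturation(void) {
  __m128i a = _mm_set1_epi8(100);
  __m128i wrap = _mm_add_epi8(a, a);  /* 200 wraps to -56 as signed char */
  __m128i sat = _mm_adds_epi8(a, a);  /* clamps at 127 */
  (void)wrap;
  (void)sat;
}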
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_sub_epi8(__m128i __A, __m128i __B) {
+__funline __m128i _mm_sub_epi8(__m128i __A, __m128i __B) {
   return (__m128i)((__v16qu)__A - (__v16qu)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_sub_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_sub_epi16(__m128i __A, __m128i __B) {
   return (__m128i)((__v8hu)__A - (__v8hu)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_sub_epi32(__m128i __A, __m128i __B) {
+__funline __m128i _mm_sub_epi32(__m128i __A, __m128i __B) {
   return (__m128i)((__v4su)__A - (__v4su)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_sub_epi64(__m128i __A, __m128i __B) {
+__funline __m128i _mm_sub_epi64(__m128i __A, __m128i __B) {
   return (__m128i)((__v2du)__A - (__v2du)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_subs_epi8(__m128i __A, __m128i __B) {
+__funline __m128i _mm_subs_epi8(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_psubsb128((__v16qi)__A, (__v16qi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_subs_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_subs_epi16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_psubsw128((__v8hi)__A, (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_subs_epu8(__m128i __A, __m128i __B) {
+__funline __m128i _mm_subs_epu8(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_psubusb128((__v16qi)__A, (__v16qi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_subs_epu16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_subs_epu16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_psubusw128((__v8hi)__A, (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_madd_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_madd_epi16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__A, (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mulhi_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_mulhi_epi16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__A, (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mullo_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_mullo_epi16(__m128i __A, __m128i __B) {
   return (__m128i)((__v8hu)__A * (__v8hu)__B);
 }
-extern __inline __m64
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mul_su32(__m64 __A, __m64 __B) {
+__funline __m64 _mm_mul_su32(__m64 __A, __m64 __B) {
   return (__m64)__builtin_ia32_pmuludq((__v2si)__A, (__v2si)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mul_epu32(__m128i __A, __m128i __B) {
+__funline __m128i _mm_mul_epu32(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmuludq128((__v4si)__A, (__v4si)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_slli_epi16(__m128i __A, int __B) {
+__funline __m128i _mm_slli_epi16(__m128i __A, int __B) {
   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__A, __B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_slli_epi32(__m128i __A, int __B) {
+__funline __m128i _mm_slli_epi32(__m128i __A, int __B) {
   return (__m128i)__builtin_ia32_pslldi128((__v4si)__A, __B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_slli_epi64(__m128i __A, int __B) {
+__funline __m128i _mm_slli_epi64(__m128i __A, int __B) {
   return (__m128i)__builtin_ia32_psllqi128((__v2di)__A, __B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_srai_epi16(__m128i __A, int __B) {
+__funline __m128i _mm_srai_epi16(__m128i __A, int __B) {
   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__A, __B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_srai_epi32(__m128i __A, int __B) {
+__funline __m128i _mm_srai_epi32(__m128i __A, int __B) {
   return (__m128i)__builtin_ia32_psradi128((__v4si)__A, __B);
 }
 #ifdef __OPTIMIZE__
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_bsrli_si128(__m128i __A, const int __N) {
+__funline __m128i _mm_bsrli_si128(__m128i __A, const int __N) {
   return (__m128i)__builtin_ia32_psrldqi128(__A, __N * 8);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_bslli_si128(__m128i __A, const int __N) {
+__funline __m128i _mm_bslli_si128(__m128i __A, const int __N) {
   return (__m128i)__builtin_ia32_pslldqi128(__A, __N * 8);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_srli_si128(__m128i __A, const int __N) {
+__funline __m128i _mm_srli_si128(__m128i __A, const int __N) {
   return (__m128i)__builtin_ia32_psrldqi128(__A, __N * 8);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_slli_si128(__m128i __A, const int __N) {
+__funline __m128i _mm_slli_si128(__m128i __A, const int __N) {
   return (__m128i)__builtin_ia32_pslldqi128(__A, __N * 8);
 }
 #else
@@ -1137,160 +785,109 @@ extern __inline __m128i
   ((__m128i)__builtin_ia32_pslldqi128((__m128i)(A), (int)(N)*8))
 #endif
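/* Editor's note, a sketch of the two shift families above: the epi16/epi32/
   epi64 shifts move bits within each element (srai replicates the sign bit,
   srli shifts in zeros), whereas the *_si128 forms shift whole bytes across
   the 128-bit register, which is why the builtins scale __N by 8. */
#include <emmintrin.h>

static void demo_shifts(void) {
  __m128i v = _mm_set1_epi32(-16);
  __m128i a = _mm_srai_epi32(v, 2);  /* arithmetic: each lane becomes -4 */
  __m128i l = _mm_srli_epi32(v, 2);  /* logical: zero-fills the top bits */
  __m128i b = _mm_srli_si128(v, 4);  /* moves the whole register right 4 bytes */
  (void)a;
  (void)l;
  (void)b;
}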
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_srli_epi16(__m128i __A, int __B) {
+__funline __m128i _mm_srli_epi16(__m128i __A, int __B) {
   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__A, __B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_srli_epi32(__m128i __A, int __B) {
+__funline __m128i _mm_srli_epi32(__m128i __A, int __B) {
   return (__m128i)__builtin_ia32_psrldi128((__v4si)__A, __B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_srli_epi64(__m128i __A, int __B) {
+__funline __m128i _mm_srli_epi64(__m128i __A, int __B) {
   return (__m128i)__builtin_ia32_psrlqi128((__v2di)__A, __B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_sll_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_sll_epi16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_sll_epi32(__m128i __A, __m128i __B) {
+__funline __m128i _mm_sll_epi32(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_sll_epi64(__m128i __A, __m128i __B) {
+__funline __m128i _mm_sll_epi64(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_sra_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_sra_epi16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_psraw128((__v8hi)__A, (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_sra_epi32(__m128i __A, __m128i __B) {
+__funline __m128i _mm_sra_epi32(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_psrad128((__v4si)__A, (__v4si)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_srl_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_srl_epi16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__A, (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_srl_epi32(__m128i __A, __m128i __B) {
+__funline __m128i _mm_srl_epi32(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_psrld128((__v4si)__A, (__v4si)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_srl_epi64(__m128i __A, __m128i __B) {
+__funline __m128i _mm_srl_epi64(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_psrlq128((__v2di)__A, (__v2di)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_and_si128(__m128i __A, __m128i __B) {
+__funline __m128i _mm_and_si128(__m128i __A, __m128i __B) {
   return (__m128i)((__v2du)__A & (__v2du)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_andnot_si128(__m128i __A, __m128i __B) {
+__funline __m128i _mm_andnot_si128(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pandn128((__v2di)__A, (__v2di)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_or_si128(__m128i __A, __m128i __B) {
+__funline __m128i _mm_or_si128(__m128i __A, __m128i __B) {
   return (__m128i)((__v2du)__A | (__v2du)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_xor_si128(__m128i __A, __m128i __B) {
+__funline __m128i _mm_xor_si128(__m128i __A, __m128i __B) {
   return (__m128i)((__v2du)__A ^ (__v2du)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
+__funline __m128i _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
   return (__m128i)((__v16qs)__A == (__v16qs)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
   return (__m128i)((__v8hi)__A == (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
+__funline __m128i _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
   return (__m128i)((__v4si)__A == (__v4si)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmplt_epi8(__m128i __A, __m128i __B) {
+__funline __m128i _mm_cmplt_epi8(__m128i __A, __m128i __B) {
   return (__m128i)((__v16qs)__A < (__v16qs)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmplt_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_cmplt_epi16(__m128i __A, __m128i __B) {
   return (__m128i)((__v8hi)__A < (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmplt_epi32(__m128i __A, __m128i __B) {
+__funline __m128i _mm_cmplt_epi32(__m128i __A, __m128i __B) {
   return (__m128i)((__v4si)__A < (__v4si)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
+__funline __m128i _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
   return (__m128i)((__v16qs)__A > (__v16qs)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
   return (__m128i)((__v8hi)__A > (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
+__funline __m128i _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
   return (__m128i)((__v4si)__A > (__v4si)__B);
 }
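/* Editor's note, a sketch of a common idiom built on the comparisons above
   together with _mm_movemask_epi8 (converted a little further down): compare
   16 bytes at once, then reduce the per-byte masks to one 16-bit integer.
   The helper name is hypothetical; p must point at 16 readable bytes. */
#include <emmintrin.h>

static int find_byte16(const unsigned char *p, unsigned char c) {
  __m128i hay = _mm_loadu_si128((const __m128i_u *)p);
  __m128i eq = _mm_cmpeq_epi8(hay, _mm_set1_epi8((char)c));
  int mask = _mm_movemask_epi8(eq);  /* bit i is set iff p[i] == c */
  return mask ? __builtin_ctz(mask) : -1;
}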
 #ifdef __OPTIMIZE__
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_extract_epi16(__m128i const __A, int const __N) {
+__funline int _mm_extract_epi16(__m128i const __A, int const __N) {
   return (unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)__A, __N);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
+__funline __m128i _mm_insert_epi16(__m128i const __A, int const __D,
+                                   int const __N) {
   return (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)__A, __D, __N);
 }
 #else
@@ -1302,58 +899,40 @@ extern __inline __m128i
       (int)(N)))
 #endif
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_max_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_max_epi16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__A, (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_max_epu8(__m128i __A, __m128i __B) {
+__funline __m128i _mm_max_epu8(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__A, (__v16qi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_min_epi16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_min_epi16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pminsw128((__v8hi)__A, (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_min_epu8(__m128i __A, __m128i __B) {
+__funline __m128i _mm_min_epu8(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pminub128((__v16qi)__A, (__v16qi)__B);
 }
-extern __inline int
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_movemask_epi8(__m128i __A) {
+__funline int _mm_movemask_epi8(__m128i __A) {
   return __builtin_ia32_pmovmskb128((__v16qi)__A);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mulhi_epu16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_mulhi_epu16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__A, (__v8hi)__B);
 }
 #ifdef __OPTIMIZE__
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_shufflehi_epi16(__m128i __A, const int __mask) {
+__funline __m128i _mm_shufflehi_epi16(__m128i __A, const int __mask) {
   return (__m128i)__builtin_ia32_pshufhw((__v8hi)__A, __mask);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_shufflelo_epi16(__m128i __A, const int __mask) {
+__funline __m128i _mm_shufflelo_epi16(__m128i __A, const int __mask) {
   return (__m128i)__builtin_ia32_pshuflw((__v8hi)__A, __mask);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_shuffle_epi32(__m128i __A, const int __mask) {
+__funline __m128i _mm_shuffle_epi32(__m128i __A, const int __mask) {
   return (__m128i)__builtin_ia32_pshufd((__v4si)__A, __mask);
 }
 #else
@@ -1365,128 +944,88 @@ extern __inline __m128i
   ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(A), (int)(N)))
 #endif
-extern __inline void
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
+__funline void _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
   __builtin_ia32_maskmovdqu((__v16qi)__A, (__v16qi)__B, __C);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_avg_epu8(__m128i __A, __m128i __B) {
+__funline __m128i _mm_avg_epu8(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__A, (__v16qi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_avg_epu16(__m128i __A, __m128i __B) {
+__funline __m128i _mm_avg_epu16(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__A, (__v8hi)__B);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_sad_epu8(__m128i __A, __m128i __B) {
+__funline __m128i _mm_sad_epu8(__m128i __A, __m128i __B) {
   return (__m128i)__builtin_ia32_psadbw128((__v16qi)__A, (__v16qi)__B);
 }
-extern __inline void
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_stream_si32(int *__A, int __B) {
+__funline void _mm_stream_si32(int *__A, int __B) {
   __builtin_ia32_movnti(__A, __B);
 }
 #ifdef __x86_64__
-extern __inline void
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_stream_si64(long long int *__A, long long int __B) {
+__funline void _mm_stream_si64(long long int *__A, long long int __B) {
   __builtin_ia32_movnti64(__A, __B);
 }
 #endif
-extern __inline void
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_stream_si128(__m128i *__A, __m128i __B) {
+__funline void _mm_stream_si128(__m128i *__A, __m128i __B) {
   __builtin_ia32_movntdq((__v2di *)__A, (__v2di)__B);
 }
-extern __inline void
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_stream_pd(double *__A, __m128d __B) {
+__funline void _mm_stream_pd(double *__A, __m128d __B) {
   __builtin_ia32_movntpd(__A, (__v2df)__B);
 }
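/* Editor's note, a sketch under the usual non-temporal-store rules: the
   _mm_stream_* intrinsics above write around the cache, and the data should
   be fenced before another agent consumes it.  _mm_mfence (converted just
   below) is sufficient, if stronger than strictly required; the destination
   must be 16-byte aligned. */
#include <emmintrin.h>

static void fill_nt(__m128i *dst /* 16-byte aligned */, int n) {
  __m128i z = _mm_setzero_si128();
  for (int i = 0; i < n; ++i) {
    _mm_stream_si128(dst + i, z);  /* bypasses the cache hierarchy */
  }
  _mm_mfence();  /* order the streaming stores */
}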
-extern __inline void
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_clflush(void const *__A) {
+__funline void _mm_clflush(void const *__A) {
   __builtin_ia32_clflush(__A);
 }
-extern __inline void
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_lfence(void) {
+__funline void _mm_lfence(void) {
   __builtin_ia32_lfence();
 }
-extern __inline void
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_mfence(void) {
+__funline void _mm_mfence(void) {
   __builtin_ia32_mfence();
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtsi32_si128(int __A) {
+__funline __m128i _mm_cvtsi32_si128(int __A) {
   return _mm_set_epi32(0, 0, 0, __A);
 }
 #ifdef __x86_64__
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtsi64_si128(long long __A) {
+__funline __m128i _mm_cvtsi64_si128(long long __A) {
   return _mm_set_epi64x(0, __A);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtsi64x_si128(long long __A) {
+__funline __m128i _mm_cvtsi64x_si128(long long __A) {
   return _mm_set_epi64x(0, __A);
 }
 #endif
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_castpd_ps(__m128d __A) {
+__funline __m128 _mm_castpd_ps(__m128d __A) {
   return (__m128)__A;
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_castpd_si128(__m128d __A) {
+__funline __m128i _mm_castpd_si128(__m128d __A) {
   return (__m128i)__A;
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_castps_pd(__m128 __A) {
+__funline __m128d _mm_castps_pd(__m128 __A) {
   return (__m128d)__A;
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_castps_si128(__m128 __A) {
+__funline __m128i _mm_castps_si128(__m128 __A) {
   return (__m128i)__A;
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_castsi128_ps(__m128i __A) {
+__funline __m128 _mm_castsi128_ps(__m128i __A) {
   return (__m128)__A;
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_castsi128_pd(__m128i __A) {
+__funline __m128d _mm_castsi128_pd(__m128i __A) {
   return (__m128d)__A;
 }
diff --git a/third_party/intel/f16cintrin.internal.h b/third_party/intel/f16cintrin.internal.h
index e32ee703b..67337d68c 100644
--- a/third_party/intel/f16cintrin.internal.h
+++ b/third_party/intel/f16cintrin.internal.h
@@ -12,44 +12,32 @@
 #define __DISABLE_F16C__
 #endif /* __F16C__ */
-extern __inline float
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _cvtsh_ss(unsigned short __S) {
+__funline float _cvtsh_ss(unsigned short __S) {
   __v8hi __H = __extension__(__v8hi){(short)__S, 0, 0, 0, 0, 0, 0, 0};
   __v4sf __A = __builtin_ia32_vcvtph2ps(__H);
   return __builtin_ia32_vec_ext_v4sf(__A, 0);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtph_ps(__m128i __A) {
+__funline __m128 _mm_cvtph_ps(__m128i __A) {
   return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__A);
 }
-extern __inline __m256
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_cvtph_ps(__m128i __A) {
+__funline __m256 _mm256_cvtph_ps(__m128i __A) {
   return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__A);
 }
 #ifdef __OPTIMIZE__
-extern __inline unsigned short
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _cvtss_sh(float __F, const int __I) {
+__funline unsigned short _cvtss_sh(float __F, const int __I) {
   __v4sf __A = __extension__(__v4sf){__F, 0, 0, 0};
   __v8hi __H = __builtin_ia32_vcvtps2ph(__A, __I);
   return (unsigned short)__builtin_ia32_vec_ext_v8hi(__H, 0);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_cvtps_ph(__m128 __A, const int __I) {
+__funline __m128i _mm_cvtps_ph(__m128 __A, const int __I) {
   return (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)__A, __I);
 }
-extern __inline __m128i
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_cvtps_ph(__m256 __A, const int __I) {
+__funline __m128i _mm256_cvtps_ph(__m256 __A, const int __I) {
   return (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__A, __I);
 }
 #else
diff --git a/third_party/intel/fma4intrin.internal.h b/third_party/intel/fma4intrin.internal.h
index 535e63555..d2ed71ab4 100644
--- a/third_party/intel/fma4intrin.internal.h
+++ b/third_party/intel/fma4intrin.internal.h
@@ -13,229 +13,165 @@
 #define __DISABLE_FMA4__
 #endif /* __FMA4__ */
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_macc_ps(__m128 __A, __m128 __B, __m128 __C) {
+__funline __m128 _mm_macc_ps(__m128 __A, __m128 __B, __m128 __C) {
   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B,
                                          (__v4sf)__C);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B,
                                           (__v2df)__C);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_macc_ss(__m128 __A, __m128 __B, __m128 __C) {
+__funline __m128 _mm_macc_ss(__m128 __A, __m128 __B, __m128 __C) {
   return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B,
                                          (__v4sf)__C);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B,
                                           (__v2df)__C);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
+__funline __m128 _mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B,
                                          -(__v4sf)__C);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B,
                                           -(__v2df)__C);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) {
+__funline __m128 _mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) {
   return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B,
                                          -(__v4sf)__C);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B,
                                           -(__v2df)__C);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C) {
+__funline __m128 _mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C) {
   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B,
                                          (__v4sf)__C);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B,
                                           (__v2df)__C);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) {
+__funline __m128 _mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) {
   return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B,
                                          (__v4sf)__C);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B,
                                           (__v2df)__C);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
+__funline __m128 _mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B,
                                          -(__v4sf)__C);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B,
                                           -(__v2df)__C);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
+__funline __m128 _mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
   return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B,
                                          -(__v4sf)__C);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B,
                                           -(__v2df)__C);
 }
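/* Editor's note, a sketch of the FMA4 sign conventions above, assuming a
   toolchain that exposes these intrinsics via <x86intrin.h> with -mfma4:
   macc = a*b+c, msub = a*b-c, nmacc = -(a*b)+c, nmsub = -(a*b)-c, each
   computed with a single rounding.  The combining add is arbitrary. */
#include <x86intrin.h>

static __m128 demo_fma4(__m128 a, __m128 b, __m128 c) {
  __m128 r0 = _mm_macc_ps(a, b, c);   /*  a*b + c */
  __m128 r1 = _mm_msub_ps(a, b, c);   /*  a*b - c */
  __m128 r2 = _mm_nmacc_ps(a, b, c);  /* -(a*b) + c */
  return _mm_add_ps(_mm_add_ps(r0, r1), r2);
}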
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C) {
+__funline __m128 _mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C) {
   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B,
                                             (__v4sf)__C);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B,
                                              (__v2df)__C);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C) {
+__funline __m128 _mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C) {
   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B,
                                             -(__v4sf)__C);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B,
                                              -(__v2df)__C);
 }
 /* 256b Floating point multiply/add type instructions.  */
-extern __inline __m256
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C) {
+__funline __m256 _mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C) {
   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B,
                                             (__v8sf)__C);
 }
-extern __inline __m256d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C) {
+__funline __m256d _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C) {
   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B,
                                              (__v4df)__C);
 }
-extern __inline __m256
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
+__funline __m256 _mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B,
                                             -(__v8sf)__C);
 }
-extern __inline __m256d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C) {
+__funline __m256d _mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C) {
   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B,
                                              -(__v4df)__C);
 }
-extern __inline __m256
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C) {
+__funline __m256 _mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C) {
   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B,
                                             (__v8sf)__C);
 }
-extern __inline __m256d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C) {
+__funline __m256d _mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C) {
   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B,
                                              (__v4df)__C);
 }
-extern __inline __m256
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
+__funline __m256 _mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B,
                                             -(__v8sf)__C);
 }
-extern __inline __m256d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
+__funline __m256d _mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B,
                                              -(__v4df)__C);
 }
-extern __inline __m256
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C) {
+__funline __m256 _mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C) {
   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B,
                                                (__v8sf)__C);
 }
-extern __inline __m256d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C) {
+__funline __m256d _mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C) {
   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B,
                                                 (__v4df)__C);
 }
-extern __inline __m256
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C) {
+__funline __m256 _mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C) {
   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B,
                                                -(__v8sf)__C);
 }
-extern __inline __m256d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C) {
+__funline __m256d _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C) {
   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B,
                                                 -(__v4df)__C);
 }
diff --git a/third_party/intel/fmaintrin.internal.h b/third_party/intel/fmaintrin.internal.h
index bba5306c0..2b7daad60 100644
--- a/third_party/intel/fmaintrin.internal.h
+++ b/third_party/intel/fmaintrin.internal.h
@@ -11,224 +11,160 @@
 #define __DISABLE_FMA__
 #endif /* __FMA__ */
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B,
                                           (__v2df)__C);
 }
-extern __inline __m256d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) {
+__funline __m256d _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) {
   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B,
                                              (__v4df)__C);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) {
+__funline __m128 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) {
   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B,
                                          (__v4sf)__C);
 }
-extern __inline __m256
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) {
+__funline __m256 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) {
   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B,
                                             (__v8sf)__C);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B,
                                            (__v2df)__C);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
+__funline __m128 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B,
                                           (__v4sf)__C);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B,
                                           (__v2df)__C);
 }
-extern __inline __m256d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
+__funline __m256d _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
   return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B,
                                              (__v4df)__C);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
+__funline __m128 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
   return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B,
                                          (__v4sf)__C);
 }
-extern __inline __m256
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
+__funline __m256 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
   return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B,
                                             (__v8sf)__C);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfmsubsd3((__v2df)__A, (__v2df)__B,
                                            (__v2df)__C);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
+__funline __m128 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
   return (__m128)__builtin_ia32_vfmsubss3((__v4sf)__A, (__v4sf)__B,
                                           (__v4sf)__C);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B,
                                            (__v2df)__C);
 }
-extern __inline __m256d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) {
+__funline __m256d _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) {
   return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B,
                                               (__v4df)__C);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) {
+__funline __m128 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) {
   return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B,
                                           (__v4sf)__C);
 }
-extern __inline __m256
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) {
+__funline __m256 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) {
   return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B,
                                              (__v8sf)__C);
 }
-extern __inline __m128d
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
+__funline __m128d _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
   return (__m128d)__builtin_ia32_vfnmaddsd3((__v2df)__A, (__v2df)__B,
                                             (__v2df)__C);
 }
-extern __inline __m128
-    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
+__funline __m128 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
   return (__m128)__builtin_ia32_vfnmaddss3((__v4sf)__A, (__v4sf)__B,
                                            (__v4sf)__C);
 }
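/* Editor's note, a sketch: in the FMA3 names above the leading "n" negates
   the product, not the addend, so fnmadd computes -(a*b)+c (i.e. c - a*b)
   and fnmsub computes -(a*b)-c, each fused into one rounding step.  Assumes
   <immintrin.h> and -mfma. */
#include <immintrin.h>

static __m128d demo_fnmadd(__m128d a, __m128d b, __m128d c) {
  return _mm_fnmadd_pd(a, b, c);  /* equivalent to c - a*b, single rounding */
}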
__always_inline__, __artificial__)) - _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) { +__funline __m256d _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) { +__funline __m128 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) { +__funline __m256 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) { +__funline __m128d _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfnmsubsd3((__v2df)__A, (__v2df)__B, (__v2df)__C); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) { +__funline __m128 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfnmsubss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C) { +__funline __m128d _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C) { +__funline __m256d _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C) { +__funline __m128 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C) { +__funline __m256 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) { +__funline __m128d _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) { +__funline __m256d _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C) { +__funline __m128 
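/*
 * The FMA conversions above all map to fused multiply-add builtins,
 * which compute a*b+c with a single rounding step rather than two.  A
 * minimal usage sketch, assuming a build with -mfma and -mavx; the
 * dot4 name and values are illustrative, not part of this patch:
 *
 *   #include <immintrin.h>
 *
 *   double dot4(const double *a, const double *b) {
 *     __m256d acc = _mm256_fmadd_pd(_mm256_loadu_pd(a),
 *                                   _mm256_loadu_pd(b),
 *                                   _mm256_setzero_pd());  // a*b+0, one rounding
 *     double out[4];
 *     _mm256_storeu_pd(out, acc);
 *     return out[0] + out[1] + out[2] + out[3];
 *   }
 *
 * Note also how the fmsubadd forms in this hunk reuse the vfmaddsub
 * builtins with a negated third operand instead of a separate builtin.
 */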
_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C) { +__funline __m256 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } diff --git a/third_party/intel/fxsrintrin.internal.h b/third_party/intel/fxsrintrin.internal.h index 28fad84d3..30d15b154 100644 --- a/third_party/intel/fxsrintrin.internal.h +++ b/third_party/intel/fxsrintrin.internal.h @@ -11,28 +11,20 @@ #define __DISABLE_FXSR__ #endif /* __FXSR__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _fxsave(void *__P) { +__funline void _fxsave(void *__P) { __builtin_ia32_fxsave(__P); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _fxrstor(void *__P) { +__funline void _fxrstor(void *__P) { __builtin_ia32_fxrstor(__P); } #ifdef __x86_64__ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _fxsave64(void *__P) { +__funline void _fxsave64(void *__P) { __builtin_ia32_fxsave64(__P); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _fxrstor64(void *__P) { +__funline void _fxrstor64(void *__P) { __builtin_ia32_fxrstor64(__P); } #endif diff --git a/third_party/intel/gfniintrin.internal.h b/third_party/intel/gfniintrin.internal.h index e19512c4c..1e345a0e9 100644 --- a/third_party/intel/gfniintrin.internal.h +++ b/third_party/intel/gfniintrin.internal.h @@ -11,23 +11,19 @@ #define __DISABLE_GFNI__ #endif /* __GFNI__ */ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_gf2p8mul_epi8(__m128i __A, __m128i __B) { +__funline __m128i _mm_gf2p8mul_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vgf2p8mulb_v16qi((__v16qi)__A, (__v16qi)__B); } #ifdef __OPTIMIZE__ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_gf2p8affineinv_epi64_epi8(__m128i __A, __m128i __B, const int __C) { +__funline __m128i _mm_gf2p8affineinv_epi64_epi8(__m128i __A, __m128i __B, + const int __C) { return (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)__A, (__v16qi)__B, __C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_gf2p8affine_epi64_epi8(__m128i __A, __m128i __B, const int __C) { +__funline __m128i _mm_gf2p8affine_epi64_epi8(__m128i __A, __m128i __B, + const int __C) { return (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)__A, (__v16qi)__B, __C); } @@ -51,23 +47,19 @@ extern __inline __m128i #define __DISABLE_GFNIAVX__ #endif /* __GFNIAVX__ */ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) { +__funline __m256i _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_vgf2p8mulb_v32qi((__v32qi)__A, (__v32qi)__B); } #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_gf2p8affineinv_epi64_epi8(__m256i __A, __m256i __B, const int __C) { +__funline __m256i _mm256_gf2p8affineinv_epi64_epi8(__m256i __A, __m256i __B, + const int __C) { return (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)__A, 
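/*
 * The GFNI intrinsics being converted here operate on bytes as GF(2^8)
 * elements reduced by the AES polynomial x^8+x^4+x^3+x+1 (0x11B).  A
 * sketch of the AES xtime step (multiply every byte by 2), assuming a
 * build with -mgfni; the xtime16 name is illustrative:
 *
 *   #include <immintrin.h>
 *
 *   __m128i xtime16(__m128i state) {
 *     return _mm_gf2p8mul_epi8(state, _mm_set1_epi8(2));
 *   }
 *
 * The mask and maskz variants in this hunk additionally take an
 * AVX-512 write mask, with maskz substituting zeros via
 * _mm_setzero_si128.
 */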
(__v32qi)__B, __C); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_gf2p8affine_epi64_epi8(__m256i __A, __m256i __B, const int __C) { +__funline __m256i _mm256_gf2p8affine_epi64_epi8(__m256i __A, __m256i __B, + const int __C) { return (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)__A, (__v32qi)__B, __C); } @@ -91,49 +83,42 @@ extern __inline __m256i #define __DISABLE_GFNIAVX512VL__ #endif /* __GFNIAVX512VL__ */ -extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_mask_gf2p8mul_epi8(__m128i __A, __mmask16 __B, __m128i __C, __m128i __D) { +__funline __m128i _mm_mask_gf2p8mul_epi8(__m128i __A, __mmask16 __B, __m128i __C, + __m128i __D) { return (__m128i)__builtin_ia32_vgf2p8mulb_v16qi_mask( (__v16qi)__C, (__v16qi)__D, (__v16qi)__A, __B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_gf2p8mul_epi8(__mmask16 __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_maskz_gf2p8mul_epi8(__mmask16 __A, __m128i __B, + __m128i __C) { return (__m128i)__builtin_ia32_vgf2p8mulb_v16qi_mask( (__v16qi)__B, (__v16qi)__C, (__v16qi)_mm_setzero_si128(), __A); } #ifdef __OPTIMIZE__ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_gf2p8affineinv_epi64_epi8(__m128i __A, __mmask16 __B, __m128i __C, - __m128i __D, const int __E) { +__funline __m128i _mm_mask_gf2p8affineinv_epi64_epi8(__m128i __A, __mmask16 __B, + __m128i __C, __m128i __D, + const int __E) { return (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi_mask( (__v16qi)__C, (__v16qi)__D, __E, (__v16qi)__A, __B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_gf2p8affineinv_epi64_epi8(__mmask16 __A, __m128i __B, __m128i __C, - const int __D) { +__funline __m128i _mm_maskz_gf2p8affineinv_epi64_epi8(__mmask16 __A, __m128i __B, + __m128i __C, + const int __D) { return (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi_mask( (__v16qi)__B, (__v16qi)__C, __D, (__v16qi)_mm_setzero_si128(), __A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mask_gf2p8affine_epi64_epi8(__m128i __A, __mmask16 __B, __m128i __C, - __m128i __D, const int __E) { +__funline __m128i _mm_mask_gf2p8affine_epi64_epi8(__m128i __A, __mmask16 __B, + __m128i __C, __m128i __D, + const int __E) { return (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi_mask( (__v16qi)__C, (__v16qi)__D, __E, (__v16qi)__A, __B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskz_gf2p8affine_epi64_epi8(__mmask16 __A, __m128i __B, __m128i __C, - const int __D) { +__funline __m128i _mm_maskz_gf2p8affine_epi64_epi8(__mmask16 __A, __m128i __B, + __m128i __C, const int __D) { return (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi_mask( (__v16qi)__B, (__v16qi)__C, __D, (__v16qi)_mm_setzero_si128(), __A); } @@ -167,51 +152,44 @@ extern __inline __m128i #define __DISABLE_GFNIAVX512VLBW__ #endif /* __GFNIAVX512VLBW__ */ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_gf2p8mul_epi8(__m256i __A, __mmask32 __B, __m256i __C, - __m256i __D) { +__funline __m256i _mm256_mask_gf2p8mul_epi8(__m256i __A, __mmask32 __B, + __m256i __C, __m256i __D) { return (__m256i)__builtin_ia32_vgf2p8mulb_v32qi_mask( (__v32qi)__C, (__v32qi)__D, (__v32qi)__A, __B); } -extern __inline __m256i - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_gf2p8mul_epi8(__mmask32 __A, __m256i __B, __m256i __C) { +__funline __m256i _mm256_maskz_gf2p8mul_epi8(__mmask32 __A, __m256i __B, + __m256i __C) { return (__m256i)__builtin_ia32_vgf2p8mulb_v32qi_mask( (__v32qi)__B, (__v32qi)__C, (__v32qi)_mm256_setzero_si256(), __A); } #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_gf2p8affineinv_epi64_epi8(__m256i __A, __mmask32 __B, - __m256i __C, __m256i __D, - const int __E) { +__funline __m256i _mm256_mask_gf2p8affineinv_epi64_epi8(__m256i __A, + __mmask32 __B, + __m256i __C, __m256i __D, + const int __E) { return (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi_mask( (__v32qi)__C, (__v32qi)__D, __E, (__v32qi)__A, __B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_gf2p8affineinv_epi64_epi8(__mmask32 __A, __m256i __B, - __m256i __C, const int __D) { +__funline __m256i _mm256_maskz_gf2p8affineinv_epi64_epi8(__mmask32 __A, + __m256i __B, __m256i __C, + const int __D) { return (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi_mask( (__v32qi)__B, (__v32qi)__C, __D, (__v32qi)_mm256_setzero_si256(), __A); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_mask_gf2p8affine_epi64_epi8(__m256i __A, __mmask32 __B, __m256i __C, - __m256i __D, const int __E) { +__funline __m256i _mm256_mask_gf2p8affine_epi64_epi8(__m256i __A, __mmask32 __B, + __m256i __C, __m256i __D, + const int __E) { return (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi_mask( (__v32qi)__C, (__v32qi)__D, __E, (__v32qi)__A, __B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_maskz_gf2p8affine_epi64_epi8(__mmask32 __A, __m256i __B, __m256i __C, - const int __D) { +__funline __m256i _mm256_maskz_gf2p8affine_epi64_epi8(__mmask32 __A, __m256i __B, + __m256i __C, + const int __D) { return (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi_mask( (__v32qi)__B, (__v32qi)__C, __D, (__v32qi)_mm256_setzero_si256(), __A); } @@ -245,69 +223,58 @@ extern __inline __m256i #define __DISABLE_GFNIAVX512FBW__ #endif /* __GFNIAVX512FBW__ */ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_gf2p8mul_epi8(__m512i __A, __mmask64 __B, __m512i __C, - __m512i __D) { +__funline __m512i _mm512_mask_gf2p8mul_epi8(__m512i __A, __mmask64 __B, + __m512i __C, __m512i __D) { return (__m512i)__builtin_ia32_vgf2p8mulb_v64qi_mask( (__v64qi)__C, (__v64qi)__D, (__v64qi)__A, __B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_gf2p8mul_epi8(__mmask64 __A, __m512i __B, __m512i __C) { +__funline __m512i _mm512_maskz_gf2p8mul_epi8(__mmask64 __A, __m512i __B, + __m512i __C) { return (__m512i)__builtin_ia32_vgf2p8mulb_v64qi_mask( (__v64qi)__B, (__v64qi)__C, (__v64qi)_mm512_setzero_si512(), __A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B) { +__funline __m512i _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_vgf2p8mulb_v64qi((__v64qi)__A, (__v64qi)__B); } #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_gf2p8affineinv_epi64_epi8(__m512i __A, __mmask64 __B, - __m512i __C, __m512i __D, - 
const int __E) { +__funline __m512i _mm512_mask_gf2p8affineinv_epi64_epi8(__m512i __A, + __mmask64 __B, + __m512i __C, __m512i __D, + const int __E) { return (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi_mask( (__v64qi)__C, (__v64qi)__D, __E, (__v64qi)__A, __B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_gf2p8affineinv_epi64_epi8(__mmask64 __A, __m512i __B, - __m512i __C, const int __D) { +__funline __m512i _mm512_maskz_gf2p8affineinv_epi64_epi8(__mmask64 __A, + __m512i __B, __m512i __C, + const int __D) { return (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi_mask( (__v64qi)__B, (__v64qi)__C, __D, (__v64qi)_mm512_setzero_si512(), __A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_gf2p8affineinv_epi64_epi8(__m512i __A, __m512i __B, const int __C) { +__funline __m512i _mm512_gf2p8affineinv_epi64_epi8(__m512i __A, __m512i __B, + const int __C) { return (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)__A, (__v64qi)__B, __C); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_mask_gf2p8affine_epi64_epi8(__m512i __A, __mmask64 __B, __m512i __C, - __m512i __D, const int __E) { +__funline __m512i _mm512_mask_gf2p8affine_epi64_epi8(__m512i __A, __mmask64 __B, + __m512i __C, __m512i __D, + const int __E) { return (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi_mask( (__v64qi)__C, (__v64qi)__D, __E, (__v64qi)__A, __B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_maskz_gf2p8affine_epi64_epi8(__mmask64 __A, __m512i __B, __m512i __C, - const int __D) { +__funline __m512i _mm512_maskz_gf2p8affine_epi64_epi8(__mmask64 __A, __m512i __B, + __m512i __C, + const int __D) { return (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi_mask( (__v64qi)__B, (__v64qi)__C, __D, (__v64qi)_mm512_setzero_si512(), __A); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_gf2p8affine_epi64_epi8(__m512i __A, __m512i __B, const int __C) { +__funline __m512i _mm512_gf2p8affine_epi64_epi8(__m512i __A, __m512i __B, + const int __C) { return (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)__A, (__v64qi)__B, __C); } diff --git a/third_party/intel/ia32intrin.internal.h b/third_party/intel/ia32intrin.internal.h index c20edb72b..f3d0193cb 100644 --- a/third_party/intel/ia32intrin.internal.h +++ b/third_party/intel/ia32intrin.internal.h @@ -2,21 +2,15 @@ #error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
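/*
 * The __crc32b/__crc32w/__crc32d wrappers below expose the SSE4.2
 * CRC32 instruction, which implements CRC-32C (Castagnoli polynomial
 * 0x1EDC6F41), not the zlib CRC-32.  A byte-at-a-time sketch, assuming
 * a build with -msse4.2; the crc32c name is illustrative:
 *
 *   unsigned crc32c(const unsigned char *p, unsigned long n) {
 *     unsigned c = 0xFFFFFFFFu;                // conventional initial value
 *     while (n--) c = __crc32b(c, *p++);
 *     return c ^ 0xFFFFFFFFu;                  // conventional final xor
 *   }
 */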
#endif -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __bsfd(int __X) { +__funline int __bsfd(int __X) { return __builtin_ctz(__X); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __bsrd(int __X) { +__funline int __bsrd(int __X) { return __builtin_ia32_bsrsi(__X); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __bswapd(int __X) { +__funline int __bswapd(int __X) { return __builtin_bswap32(__X); } @@ -28,21 +22,15 @@ extern __inline int #define __DISABLE_SSE4_2__ #endif /* __SSE4_2__ */ -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __crc32b(unsigned int __C, unsigned char __V) { +__funline unsigned int __crc32b(unsigned int __C, unsigned char __V) { return __builtin_ia32_crc32qi(__C, __V); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __crc32w(unsigned int __C, unsigned short __V) { +__funline unsigned int __crc32w(unsigned int __C, unsigned short __V) { return __builtin_ia32_crc32hi(__C, __V); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __crc32d(unsigned int __C, unsigned int __V) { +__funline unsigned int __crc32d(unsigned int __C, unsigned int __V) { return __builtin_ia32_crc32si(__C, __V); } @@ -53,99 +41,71 @@ extern __inline unsigned int #endif /* __iamcu__ */ -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __popcntd(unsigned int __X) { +__funline int __popcntd(unsigned int __X) { return __builtin_popcount(__X); } #ifndef __iamcu__ -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __rdpmc(int __S) { +__funline unsigned long long __rdpmc(int __S) { return __builtin_ia32_rdpmc(__S); } #endif /* __iamcu__ */ -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __rdtsc(void) { +__funline unsigned long long __rdtsc(void) { return __builtin_ia32_rdtsc(); } #ifndef __iamcu__ -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __rdtscp(unsigned int *__A) { +__funline unsigned long long __rdtscp(unsigned int *__A) { return __builtin_ia32_rdtscp(__A); } #endif /* __iamcu__ */ -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __rolb(unsigned char __X, int __C) { +__funline unsigned char __rolb(unsigned char __X, int __C) { return __builtin_ia32_rolqi(__X, __C); } -extern __inline unsigned short - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __rolw(unsigned short __X, int __C) { +__funline unsigned short __rolw(unsigned short __X, int __C) { return __builtin_ia32_rolhi(__X, __C); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __rold(unsigned int __X, int __C) { +__funline unsigned int __rold(unsigned int __X, int __C) { __C &= 31; return (__X << __C) | (__X >> (-__C & 31)); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __rorb(unsigned char __X, int __C) { +__funline unsigned char __rorb(unsigned char __X, int __C) { return __builtin_ia32_rorqi(__X, __C); } -extern __inline unsigned short - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __rorw(unsigned short __X, int __C) { 
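/*
 * The open-coded rotates here use the (-__C & 31) idiom so that a
 * count of zero never triggers an undefined shift by the full word
 * width; both GCC and Clang recognize the pattern and emit a single
 * rol/ror instruction.  For example:
 *
 *   __rold(0x80000001u, 1);   // yields 0x00000003
 *   __rold(0xDEADBEEFu, 0);   // yields 0xDEADBEEF with no undefined shift
 */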
+__funline unsigned short __rorw(unsigned short __X, int __C) { return __builtin_ia32_rorhi(__X, __C); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __rord(unsigned int __X, int __C) { +__funline unsigned int __rord(unsigned int __X, int __C) { __C &= 31; return (__X >> __C) | (__X << (-__C & 31)); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __pause(void) { +__funline void __pause(void) { __builtin_ia32_pause(); } #ifdef __x86_64__ -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __bsfq(long long __X) { +__funline int __bsfq(long long __X) { return __builtin_ctzll(__X); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __bsrq(long long __X) { +__funline int __bsrq(long long __X) { return __builtin_ia32_bsrdi(__X); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __bswapq(long long __X) { +__funline long long __bswapq(long long __X) { return __builtin_bswap64(__X); } @@ -155,9 +115,8 @@ extern __inline long long #define __DISABLE_SSE4_2__ #endif /* __SSE4_2__ */ -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __crc32q(unsigned long long __C, unsigned long long __V) { +__funline unsigned long long __crc32q(unsigned long long __C, + unsigned long long __V) { return __builtin_ia32_crc32di(__C, __V); } @@ -166,35 +125,25 @@ extern __inline unsigned long long #pragma GCC pop_options #endif /* __DISABLE_SSE4_2__ */ -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __popcntq(unsigned long long __X) { +__funline long long __popcntq(unsigned long long __X) { return __builtin_popcountll(__X); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __rolq(unsigned long long __X, int __C) { +__funline unsigned long long __rolq(unsigned long long __X, int __C) { __C &= 63; return (__X << __C) | (__X >> (-__C & 63)); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __rorq(unsigned long long __X, int __C) { +__funline unsigned long long __rorq(unsigned long long __X, int __C) { __C &= 63; return (__X >> __C) | (__X << (-__C & 63)); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __readeflags(void) { +__funline unsigned long long __readeflags(void) { return __builtin_ia32_readeflags_u64(); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __writeeflags(unsigned long long __X) { +__funline void __writeeflags(unsigned long long __X) { __builtin_ia32_writeeflags_u64(__X); } @@ -202,15 +151,11 @@ extern __inline void #define _popcnt64(a) __popcntq(a) #else -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __readeflags(void) { +__funline unsigned int __readeflags(void) { return __builtin_ia32_readeflags_u32(); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __writeeflags(unsigned int __X) { +__funline void __writeeflags(unsigned int __X) { __builtin_ia32_writeeflags_u32(__X); } diff --git a/third_party/intel/immintrin.internal.h b/third_party/intel/immintrin.internal.h index 4d3dd4b19..ca9f21f81 100644 --- a/third_party/intel/immintrin.internal.h +++ 
b/third_party/intel/immintrin.internal.h @@ -65,9 +65,7 @@ #include "third_party/intel/pkuintrin.internal.h" /* clang-format on */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _wbinvd(void) { +__funline void _wbinvd(void) { __builtin_ia32_wbinvd(); } @@ -76,15 +74,11 @@ extern __inline void #pragma GCC target("rdrnd") #define __DISABLE_RDRND__ #endif /* __RDRND__ */ -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _rdrand16_step(unsigned short *__P) { +__funline int _rdrand16_step(unsigned short *__P) { return __builtin_ia32_rdrand16_step(__P); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _rdrand32_step(unsigned int *__P) { +__funline int _rdrand32_step(unsigned int *__P) { return __builtin_ia32_rdrand32_step(__P); } #ifdef __DISABLE_RDRND__ @@ -97,9 +91,7 @@ extern __inline int #pragma GCC target("rdpid") #define __DISABLE_RDPID__ #endif /* __RDPID__ */ -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _rdpid_u32(void) { +__funline unsigned int _rdpid_u32(void) { return __builtin_ia32_rdpid(); } #ifdef __DISABLE_RDPID__ @@ -114,51 +106,35 @@ extern __inline unsigned int #pragma GCC target("fsgsbase") #define __DISABLE_FSGSBASE__ #endif /* __FSGSBASE__ */ -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _readfsbase_u32(void) { +__funline unsigned int _readfsbase_u32(void) { return __builtin_ia32_rdfsbase32(); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _readfsbase_u64(void) { +__funline unsigned long long _readfsbase_u64(void) { return __builtin_ia32_rdfsbase64(); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _readgsbase_u32(void) { +__funline unsigned int _readgsbase_u32(void) { return __builtin_ia32_rdgsbase32(); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _readgsbase_u64(void) { +__funline unsigned long long _readgsbase_u64(void) { return __builtin_ia32_rdgsbase64(); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _writefsbase_u32(unsigned int __B) { +__funline void _writefsbase_u32(unsigned int __B) { __builtin_ia32_wrfsbase32(__B); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _writefsbase_u64(unsigned long long __B) { +__funline void _writefsbase_u64(unsigned long long __B) { __builtin_ia32_wrfsbase64(__B); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _writegsbase_u32(unsigned int __B) { +__funline void _writegsbase_u32(unsigned int __B) { __builtin_ia32_wrgsbase32(__B); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _writegsbase_u64(unsigned long long __B) { +__funline void _writegsbase_u64(unsigned long long __B) { __builtin_ia32_wrgsbase64(__B); } #ifdef __DISABLE_FSGSBASE__ @@ -171,9 +147,7 @@ extern __inline void #pragma GCC target("rdrnd") #define __DISABLE_RDRND__ #endif /* __RDRND__ */ -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _rdrand64_step(unsigned long long *__P) { +__funline int _rdrand64_step(unsigned long long *__P) { return __builtin_ia32_rdrand64_step(__P); } #ifdef __DISABLE_RDRND__ @@ -190,16 +164,12 @@ 
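/*
 * _rdrand16_step, _rdrand32_step and _rdrand64_step return 1 on
 * success and 0 when the on-chip DRBG is not ready, so callers are
 * expected to retry.  A bounded retry loop, assuming a build with
 * -mrdrnd on x86-64 (ten attempts follows Intel's guidance); the
 * rdrand64 name is illustrative:
 *
 *   int rdrand64(unsigned long long *out) {
 *     for (int i = 0; i < 10; ++i) {
 *       if (_rdrand64_step(out)) return 1;   // success
 *     }
 *     return 0;                              // hardware kept failing
 *   }
 */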
extern __inline int #endif #ifdef __x86_64__ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _ptwrite64(unsigned long long __B) { +__funline void _ptwrite64(unsigned long long __B) { __builtin_ia32_ptwrite64(__B); } #endif /* __x86_64__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _ptwrite32(unsigned __B) { +__funline void _ptwrite32(unsigned __B) { __builtin_ia32_ptwrite32(__B); } #ifdef __DISABLE_PTWRITE__ diff --git a/third_party/intel/lwpintrin.internal.h b/third_party/intel/lwpintrin.internal.h index 58324b3a3..af776aebf 100644 --- a/third_party/intel/lwpintrin.internal.h +++ b/third_party/intel/lwpintrin.internal.h @@ -11,29 +11,23 @@ #define __DISABLE_LWP__ #endif /* __LWP__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __llwpcb(void *__pcbAddress) { +__funline void __llwpcb(void *__pcbAddress) { __builtin_ia32_llwpcb(__pcbAddress); } -extern __inline void *__attribute__((__gnu_inline__, __always_inline__, - __artificial__)) __slwpcb(void) { +__funline void *__slwpcb(void) { return __builtin_ia32_slwpcb(); } #ifdef __OPTIMIZE__ -extern __inline void __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -__lwpval32(unsigned int __data2, unsigned int __data1, unsigned int __flags) { +__funline void __lwpval32(unsigned int __data2, unsigned int __data1, + unsigned int __flags) { __builtin_ia32_lwpval32(__data2, __data1, __flags); } #ifdef __x86_64__ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __lwpval64(unsigned long long __data2, unsigned int __data1, - unsigned int __flags) { +__funline void __lwpval64(unsigned long long __data2, unsigned int __data1, + unsigned int __flags) { __builtin_ia32_lwpval64(__data2, __data1, __flags); } #endif @@ -49,17 +43,14 @@ extern __inline void #endif #ifdef __OPTIMIZE__ -extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -__lwpins32(unsigned int __data2, unsigned int __data1, unsigned int __flags) { +__funline unsigned char __lwpins32(unsigned int __data2, unsigned int __data1, + unsigned int __flags) { return __builtin_ia32_lwpins32(__data2, __data1, __flags); } #ifdef __x86_64__ -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __lwpins64(unsigned long long __data2, unsigned int __data1, - unsigned int __flags) { +__funline unsigned char __lwpins64(unsigned long long __data2, + unsigned int __data1, unsigned int __flags) { return __builtin_ia32_lwpins64(__data2, __data1, __flags); } #endif diff --git a/third_party/intel/lzcntintrin.internal.h b/third_party/intel/lzcntintrin.internal.h index 30b01ec8b..e4a97090f 100644 --- a/third_party/intel/lzcntintrin.internal.h +++ b/third_party/intel/lzcntintrin.internal.h @@ -11,34 +11,24 @@ #define __DISABLE_LZCNT__ #endif /* __LZCNT__ */ -extern __inline unsigned short - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __lzcnt16(unsigned short __X) { +__funline unsigned short __lzcnt16(unsigned short __X) { return __builtin_ia32_lzcnt_u16(__X); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __lzcnt32(unsigned int __X) { +__funline unsigned int __lzcnt32(unsigned int __X) { return __builtin_ia32_lzcnt_u32(__X); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
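/*
 * Unlike the BSR-based __bsrd earlier in this patch, the LZCNT
 * wrappers here are well defined for a zero input: _lzcnt_u32(0)
 * returns 32 and _lzcnt_u64(0) returns 64.  A floor-log2 sketch,
 * assuming a build with -mlzcnt; the ilog2 name is illustrative:
 *
 *   unsigned ilog2(unsigned x) {
 *     return x ? 31 - _lzcnt_u32(x) : 0;   // caller defines the x==0 case
 *   }
 */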
_lzcnt_u32(unsigned int __X) { +__funline unsigned int _lzcnt_u32(unsigned int __X) { return __builtin_ia32_lzcnt_u32(__X); } #ifdef __x86_64__ -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __lzcnt64(unsigned long long __X) { +__funline unsigned long long __lzcnt64(unsigned long long __X) { return __builtin_ia32_lzcnt_u64(__X); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _lzcnt_u64(unsigned long long __X) { +__funline unsigned long long _lzcnt_u64(unsigned long long __X) { return __builtin_ia32_lzcnt_u64(__X); } #endif diff --git a/third_party/intel/mm3dnow.internal.h b/third_party/intel/mm3dnow.internal.h index 7584a9327..6d278373e 100644 --- a/third_party/intel/mm3dnow.internal.h +++ b/third_party/intel/mm3dnow.internal.h @@ -14,141 +14,95 @@ #define __DISABLE_3dNOW__ #endif /* __3dNOW__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_femms(void) { +__funline void _m_femms(void) { __builtin_ia32_femms(); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pavgusb(__m64 __A, __m64 __B) { +__funline __m64 _m_pavgusb(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pavgusb((__v8qi)__A, (__v8qi)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pf2id(__m64 __A) { +__funline __m64 _m_pf2id(__m64 __A) { return (__m64)__builtin_ia32_pf2id((__v2sf)__A); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfacc(__m64 __A, __m64 __B) { +__funline __m64 _m_pfacc(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pfacc((__v2sf)__A, (__v2sf)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfadd(__m64 __A, __m64 __B) { +__funline __m64 _m_pfadd(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pfadd((__v2sf)__A, (__v2sf)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfcmpeq(__m64 __A, __m64 __B) { +__funline __m64 _m_pfcmpeq(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__A, (__v2sf)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfcmpge(__m64 __A, __m64 __B) { +__funline __m64 _m_pfcmpge(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pfcmpge((__v2sf)__A, (__v2sf)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfcmpgt(__m64 __A, __m64 __B) { +__funline __m64 _m_pfcmpgt(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__A, (__v2sf)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfmax(__m64 __A, __m64 __B) { +__funline __m64 _m_pfmax(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pfmax((__v2sf)__A, (__v2sf)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfmin(__m64 __A, __m64 __B) { +__funline __m64 _m_pfmin(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pfmin((__v2sf)__A, (__v2sf)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfmul(__m64 __A, __m64 __B) { +__funline __m64 _m_pfmul(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pfmul((__v2sf)__A, (__v2sf)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) - _m_pfrcp(__m64 __A) { +__funline __m64 _m_pfrcp(__m64 __A) { return (__m64)__builtin_ia32_pfrcp((__v2sf)__A); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfrcpit1(__m64 __A, __m64 __B) { +__funline __m64 _m_pfrcpit1(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__A, (__v2sf)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfrcpit2(__m64 __A, __m64 __B) { +__funline __m64 _m_pfrcpit2(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__A, (__v2sf)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfrsqrt(__m64 __A) { +__funline __m64 _m_pfrsqrt(__m64 __A) { return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__A); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfrsqit1(__m64 __A, __m64 __B) { +__funline __m64 _m_pfrsqit1(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__A, (__v2sf)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfsub(__m64 __A, __m64 __B) { +__funline __m64 _m_pfsub(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pfsub((__v2sf)__A, (__v2sf)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfsubr(__m64 __A, __m64 __B) { +__funline __m64 _m_pfsubr(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pfsubr((__v2sf)__A, (__v2sf)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pi2fd(__m64 __A) { +__funline __m64 _m_pi2fd(__m64 __A) { return (__m64)__builtin_ia32_pi2fd((__v2si)__A); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pmulhrw(__m64 __A, __m64 __B) { +__funline __m64 _m_pmulhrw(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pmulhrw((__v4hi)__A, (__v4hi)__B); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_prefetch(void *__P) { +__funline void _m_prefetch(void *__P) { __builtin_prefetch(__P, 0, 3 /* _MM_HINT_T0 */); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_from_float(float __A) { +__funline __m64 _m_from_float(float __A) { return __extension__(__m64)(__v2sf){__A, 0.0f}; } -extern __inline float - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_to_float(__m64 __A) { +__funline float _m_to_float(__m64 __A) { union { __v2sf v; float a[2]; @@ -172,33 +126,23 @@ extern __inline float #define __DISABLE_3dNOW_A__ #endif /* __3dNOW_A__ */ -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pf2iw(__m64 __A) { +__funline __m64 _m_pf2iw(__m64 __A) { return (__m64)__builtin_ia32_pf2iw((__v2sf)__A); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfnacc(__m64 __A, __m64 __B) { +__funline __m64 _m_pfnacc(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pfnacc((__v2sf)__A, (__v2sf)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pfpnacc(__m64 __A, __m64 __B) { +__funline __m64 _m_pfpnacc(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pfpnacc((__v2sf)__A, (__v2sf)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
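/*
 * 3DNow! is a legacy AMD-only extension that modern AMD CPUs no longer
 * implement, so these wrappers mostly matter for old code.  Like MMX,
 * its registers alias the x87 stack, so _m_femms must run before
 * returning to ordinary floating-point code.  A sketch, assuming an
 * older toolchain built with -m3dnow; the values are illustrative:
 *
 *   __m64 a = _m_from_float(1.5f), b = _m_from_float(2.5f);
 *   float sum = _m_to_float(_m_pfadd(a, b));   // 4.0f
 *   _m_femms();                                // clear MMX/3DNow! state
 */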
_m_pi2fw(__m64 __A) { +__funline __m64 _m_pi2fw(__m64 __A) { return (__m64)__builtin_ia32_pi2fw((__v2si)__A); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pswapd(__m64 __A) { +__funline __m64 _m_pswapd(__m64 __A) { return (__m64)__builtin_ia32_pswapdsf((__v2sf)__A); } diff --git a/third_party/intel/mmintrin.internal.h b/third_party/intel/mmintrin.internal.h index 17a73a3db..58f114641 100644 --- a/third_party/intel/mmintrin.internal.h +++ b/third_party/intel/mmintrin.internal.h @@ -23,231 +23,157 @@ typedef char __v8qi __attribute__((__vector_size__(8))); typedef long long __v1di __attribute__((__vector_size__(8))); typedef float __v2sf __attribute__((__vector_size__(8))); -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_empty(void) { +__funline void _mm_empty(void) { __builtin_ia32_emms(); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_empty(void) { +__funline void _m_empty(void) { _mm_empty(); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsi32_si64(int __i) { +__funline __m64 _mm_cvtsi32_si64(int __i) { return (__m64)__builtin_ia32_vec_init_v2si(__i, 0); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_from_int(int __i) { +__funline __m64 _m_from_int(int __i) { return _mm_cvtsi32_si64(__i); } #ifdef __x86_64__ -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_from_int64(long long __i) { +__funline __m64 _m_from_int64(long long __i) { return (__m64)__i; } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsi64_m64(long long __i) { +__funline __m64 _mm_cvtsi64_m64(long long __i) { return (__m64)__i; } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsi64x_si64(long long __i) { +__funline __m64 _mm_cvtsi64x_si64(long long __i) { return (__m64)__i; } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_set_pi64x(long long __i) { +__funline __m64 _mm_set_pi64x(long long __i) { return (__m64)__i; } #endif -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsi64_si32(__m64 __i) { +__funline int _mm_cvtsi64_si32(__m64 __i) { return __builtin_ia32_vec_ext_v2si((__v2si)__i, 0); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_to_int(__m64 __i) { +__funline int _m_to_int(__m64 __i) { return _mm_cvtsi64_si32(__i); } #ifdef __x86_64__ -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_to_int64(__m64 __i) { +__funline long long _m_to_int64(__m64 __i) { return (long long)__i; } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtm64_si64(__m64 __i) { +__funline long long _mm_cvtm64_si64(__m64 __i) { return (long long)__i; } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsi64_si64x(__m64 __i) { +__funline long long _mm_cvtsi64_si64x(__m64 __i) { return (long long)__i; } #endif -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_packs_pi16(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_packs_pi16(__m64 __m1, __m64 __m2) { return 
(__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_packsswb(__m64 __m1, __m64 __m2) { +__funline __m64 _m_packsswb(__m64 __m1, __m64 __m2) { return _mm_packs_pi16(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_packs_pi32(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_packs_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_packssdw(__m64 __m1, __m64 __m2) { +__funline __m64 _m_packssdw(__m64 __m1, __m64 __m2) { return _mm_packs_pi32(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_packs_pu16(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_packs_pu16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_packuswb(__m64 __m1, __m64 __m2) { +__funline __m64 _m_packuswb(__m64 __m1, __m64 __m2) { return _mm_packs_pu16(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_punpckhbw(__m64 __m1, __m64 __m2) { +__funline __m64 _m_punpckhbw(__m64 __m1, __m64 __m2) { return _mm_unpackhi_pi8(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_punpckhwd(__m64 __m1, __m64 __m2) { +__funline __m64 _m_punpckhwd(__m64 __m1, __m64 __m2) { return _mm_unpackhi_pi16(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_punpckhdq(__m64 __m1, __m64 __m2) { +__funline __m64 _m_punpckhdq(__m64 __m1, __m64 __m2) { return _mm_unpackhi_pi32(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_punpcklbw(__m64 __m1, __m64 __m2) { +__funline __m64 _m_punpcklbw(__m64 __m1, __m64 __m2) { return _mm_unpacklo_pi8(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2); } -extern __inline __m64 - 
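/*
 * The pack conversions above saturate rather than truncate:
 * _mm_packs_pi16 narrows signed 16-bit lanes to 8 bits clamped into
 * [-128, 127], and _mm_packs_pu16 clamps into [0, 255].  For example,
 * assuming a build with -mmmx:
 *
 *   __m64 w = _mm_set_pi16(300, -300, 5, -5);
 *   __m64 b = _mm_packs_pi16(w, w);   // 300 becomes 127, -300 becomes -128
 */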
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_punpcklwd(__m64 __m1, __m64 __m2) { +__funline __m64 _m_punpcklwd(__m64 __m1, __m64 __m2) { return _mm_unpacklo_pi16(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_punpckldq(__m64 __m1, __m64 __m2) { +__funline __m64 _m_punpckldq(__m64 __m1, __m64 __m2) { return _mm_unpacklo_pi32(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_add_pi8(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_add_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_paddb(__m64 __m1, __m64 __m2) { +__funline __m64 _m_paddb(__m64 __m1, __m64 __m2) { return _mm_add_pi8(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_add_pi16(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_add_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_paddw(__m64 __m1, __m64 __m2) { +__funline __m64 _m_paddw(__m64 __m1, __m64 __m2) { return _mm_add_pi16(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_add_pi32(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_add_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_paddd(__m64 __m1, __m64 __m2) { +__funline __m64 _m_paddd(__m64 __m1, __m64 __m2) { return _mm_add_pi32(__m1, __m2); } @@ -257,9 +183,7 @@ extern __inline __m64 #define __DISABLE_SSE2__ #endif /* __SSE2__ */ -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_add_si64(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_add_si64(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddq((__v1di)__m1, (__v1di)__m2); } #ifdef __DISABLE_SSE2__ @@ -267,87 +191,59 @@ extern __inline __m64 #pragma GCC pop_options #endif /* __DISABLE_SSE2__ */ -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_adds_pi8(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_adds_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_paddsb(__m64 __m1, __m64 __m2) { +__funline __m64 _m_paddsb(__m64 __m1, __m64 __m2) { return _mm_adds_pi8(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_adds_pi16(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_adds_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_paddsw(__m64 __m1, __m64 __m2) { +__funline __m64 _m_paddsw(__m64 __m1, __m64 __m2) { return _mm_adds_pi16(__m1, __m2); } -extern __inline __m64 - 
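/*
 * A caveat that applies to every __m64 wrapper in this file: MMX state
 * aliases the x87 floating-point stack, so _mm_empty() (EMMS) must be
 * executed after an MMX section and before using float or double
 * again.  The saturating forms nearby clamp instead of wrapping:
 *
 *   __m64 x = _mm_adds_pu8(_mm_set1_pi8((char)200), _mm_set1_pi8(100));
 *   // every unsigned byte lane holds 255, not 300 mod 256
 *   _mm_empty();   // required before any x87 floating-point use
 */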
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_adds_pu8(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_adds_pu8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_paddusb(__m64 __m1, __m64 __m2) { +__funline __m64 _m_paddusb(__m64 __m1, __m64 __m2) { return _mm_adds_pu8(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_adds_pu16(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_adds_pu16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_paddusw(__m64 __m1, __m64 __m2) { +__funline __m64 _m_paddusw(__m64 __m1, __m64 __m2) { return _mm_adds_pu16(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sub_pi8(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_sub_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psubb(__m64 __m1, __m64 __m2) { +__funline __m64 _m_psubb(__m64 __m1, __m64 __m2) { return _mm_sub_pi8(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sub_pi16(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_sub_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psubw(__m64 __m1, __m64 __m2) { +__funline __m64 _m_psubw(__m64 __m1, __m64 __m2) { return _mm_sub_pi16(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sub_pi32(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_sub_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psubd(__m64 __m1, __m64 __m2) { +__funline __m64 _m_psubd(__m64 __m1, __m64 __m2) { return _mm_sub_pi32(__m1, __m2); } @@ -357,9 +253,7 @@ extern __inline __m64 #define __DISABLE_SSE2__ #endif /* __SSE2__ */ -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sub_si64(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_sub_si64(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubq((__v1di)__m1, (__v1di)__m2); } #ifdef __DISABLE_SSE2__ @@ -367,462 +261,310 @@ extern __inline __m64 #pragma GCC pop_options #endif /* __DISABLE_SSE2__ */ -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_subs_pi8(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_subs_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psubsb(__m64 __m1, __m64 __m2) { +__funline __m64 _m_psubsb(__m64 __m1, __m64 __m2) { return _mm_subs_pi8(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_subs_pi16(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_subs_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2); } -extern 
__inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psubsw(__m64 __m1, __m64 __m2) { +__funline __m64 _m_psubsw(__m64 __m1, __m64 __m2) { return _mm_subs_pi16(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_subs_pu8(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_subs_pu8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psubusb(__m64 __m1, __m64 __m2) { +__funline __m64 _m_psubusb(__m64 __m1, __m64 __m2) { return _mm_subs_pu8(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_subs_pu16(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_subs_pu16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psubusw(__m64 __m1, __m64 __m2) { +__funline __m64 _m_psubusw(__m64 __m1, __m64 __m2) { return _mm_subs_pu16(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_madd_pi16(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_madd_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pmaddwd(__m64 __m1, __m64 __m2) { +__funline __m64 _m_pmaddwd(__m64 __m1, __m64 __m2) { return _mm_madd_pi16(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mulhi_pi16(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_mulhi_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pmulhw(__m64 __m1, __m64 __m2) { +__funline __m64 _m_pmulhw(__m64 __m1, __m64 __m2) { return _mm_mulhi_pi16(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mullo_pi16(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_mullo_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pmullw(__m64 __m1, __m64 __m2) { +__funline __m64 _m_pmullw(__m64 __m1, __m64 __m2) { return _mm_mullo_pi16(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sll_pi16(__m64 __m, __m64 __count) { +__funline __m64 _mm_sll_pi16(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_psllw((__v4hi)__m, (__v4hi)__count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psllw(__m64 __m, __m64 __count) { +__funline __m64 _m_psllw(__m64 __m, __m64 __count) { return _mm_sll_pi16(__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_slli_pi16(__m64 __m, int __count) { +__funline __m64 _mm_slli_pi16(__m64 __m, int __count) { return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psllwi(__m64 __m, int __count) { +__funline __m64 _m_psllwi(__m64 __m, int __count) { return _mm_slli_pi16(__m, 
__count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sll_pi32(__m64 __m, __m64 __count) { +__funline __m64 _mm_sll_pi32(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_pslld((__v2si)__m, (__v2si)__count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pslld(__m64 __m, __m64 __count) { +__funline __m64 _m_pslld(__m64 __m, __m64 __count) { return _mm_sll_pi32(__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_slli_pi32(__m64 __m, int __count) { +__funline __m64 _mm_slli_pi32(__m64 __m, int __count) { return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pslldi(__m64 __m, int __count) { +__funline __m64 _m_pslldi(__m64 __m, int __count) { return _mm_slli_pi32(__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sll_si64(__m64 __m, __m64 __count) { +__funline __m64 _mm_sll_si64(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_psllq((__v1di)__m, (__v1di)__count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psllq(__m64 __m, __m64 __count) { +__funline __m64 _m_psllq(__m64 __m, __m64 __count) { return _mm_sll_si64(__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_slli_si64(__m64 __m, int __count) { +__funline __m64 _mm_slli_si64(__m64 __m, int __count) { return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psllqi(__m64 __m, int __count) { +__funline __m64 _m_psllqi(__m64 __m, int __count) { return _mm_slli_si64(__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sra_pi16(__m64 __m, __m64 __count) { +__funline __m64 _mm_sra_pi16(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_psraw((__v4hi)__m, (__v4hi)__count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psraw(__m64 __m, __m64 __count) { +__funline __m64 _m_psraw(__m64 __m, __m64 __count) { return _mm_sra_pi16(__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_srai_pi16(__m64 __m, int __count) { +__funline __m64 _mm_srai_pi16(__m64 __m, int __count) { return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psrawi(__m64 __m, int __count) { +__funline __m64 _m_psrawi(__m64 __m, int __count) { return _mm_srai_pi16(__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sra_pi32(__m64 __m, __m64 __count) { +__funline __m64 _mm_sra_pi32(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_psrad((__v2si)__m, (__v2si)__count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psrad(__m64 __m, __m64 __count) { +__funline __m64 _m_psrad(__m64 __m, __m64 __count) { return _mm_sra_pi32(__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_srai_pi32(__m64 __m, int __count) { +__funline __m64 _mm_srai_pi32(__m64 __m, int __count) { 
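/*
 * MMX shift counts are not masked the way C scalar shifts on x86 are:
 * a count at or above the lane width zeroes every lane for the logical
 * shifts, while the arithmetic psraw/psrad forms fill lanes with the
 * sign bit.  For example, assuming a build with -mmmx:
 *
 *   _mm_slli_pi16(_mm_set1_pi16(1), 16);    // all lanes become 0
 *   _mm_srai_pi16(_mm_set1_pi16(-2), 100);  // all lanes become -1
 */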
return (__m64)__builtin_ia32_psradi((__v2si)__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psradi(__m64 __m, int __count) { +__funline __m64 _m_psradi(__m64 __m, int __count) { return _mm_srai_pi32(__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_srl_pi16(__m64 __m, __m64 __count) { +__funline __m64 _mm_srl_pi16(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_psrlw((__v4hi)__m, (__v4hi)__count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psrlw(__m64 __m, __m64 __count) { +__funline __m64 _m_psrlw(__m64 __m, __m64 __count) { return _mm_srl_pi16(__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_srli_pi16(__m64 __m, int __count) { +__funline __m64 _mm_srli_pi16(__m64 __m, int __count) { return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psrlwi(__m64 __m, int __count) { +__funline __m64 _m_psrlwi(__m64 __m, int __count) { return _mm_srli_pi16(__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_srl_pi32(__m64 __m, __m64 __count) { +__funline __m64 _mm_srl_pi32(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_psrld((__v2si)__m, (__v2si)__count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psrld(__m64 __m, __m64 __count) { +__funline __m64 _m_psrld(__m64 __m, __m64 __count) { return _mm_srl_pi32(__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_srli_pi32(__m64 __m, int __count) { +__funline __m64 _mm_srli_pi32(__m64 __m, int __count) { return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psrldi(__m64 __m, int __count) { +__funline __m64 _m_psrldi(__m64 __m, int __count) { return _mm_srli_pi32(__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_srl_si64(__m64 __m, __m64 __count) { +__funline __m64 _mm_srl_si64(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_psrlq((__v1di)__m, (__v1di)__count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psrlq(__m64 __m, __m64 __count) { +__funline __m64 _m_psrlq(__m64 __m, __m64 __count) { return _mm_srl_si64(__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_srli_si64(__m64 __m, int __count) { +__funline __m64 _mm_srli_si64(__m64 __m, int __count) { return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psrlqi(__m64 __m, int __count) { +__funline __m64 _m_psrlqi(__m64 __m, int __count) { return _mm_srli_si64(__m, __count); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_and_si64(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_and_si64(__m64 __m1, __m64 __m2) { return __builtin_ia32_pand(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pand(__m64 __m1, __m64 __m2) { +__funline __m64 _m_pand(__m64 __m1, __m64 
__m2) { return _mm_and_si64(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_andnot_si64(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_andnot_si64(__m64 __m1, __m64 __m2) { return __builtin_ia32_pandn(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pandn(__m64 __m1, __m64 __m2) { +__funline __m64 _m_pandn(__m64 __m1, __m64 __m2) { return _mm_andnot_si64(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_or_si64(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_or_si64(__m64 __m1, __m64 __m2) { return __builtin_ia32_por(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_por(__m64 __m1, __m64 __m2) { +__funline __m64 _m_por(__m64 __m1, __m64 __m2) { return _mm_or_si64(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_xor_si64(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_xor_si64(__m64 __m1, __m64 __m2) { return __builtin_ia32_pxor(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pxor(__m64 __m1, __m64 __m2) { +__funline __m64 _m_pxor(__m64 __m1, __m64 __m2) { return _mm_xor_si64(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pcmpeqb(__m64 __m1, __m64 __m2) { +__funline __m64 _m_pcmpeqb(__m64 __m1, __m64 __m2) { return _mm_cmpeq_pi8(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pcmpgtb(__m64 __m1, __m64 __m2) { +__funline __m64 _m_pcmpgtb(__m64 __m1, __m64 __m2) { return _mm_cmpgt_pi8(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pcmpeqw(__m64 __m1, __m64 __m2) { +__funline __m64 _m_pcmpeqw(__m64 __m1, __m64 __m2) { return _mm_cmpeq_pi16(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pcmpgtw(__m64 __m1, __m64 __m2) { +__funline __m64 _m_pcmpgtw(__m64 __m1, __m64 __m2) { return _mm_cmpgt_pi16(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, 
(__v2si)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pcmpeqd(__m64 __m1, __m64 __m2) { +__funline __m64 _m_pcmpeqd(__m64 __m1, __m64 __m2) { return _mm_cmpeq_pi32(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) { +__funline __m64 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pcmpgtd(__m64 __m1, __m64 __m2) { +__funline __m64 _m_pcmpgtd(__m64 __m1, __m64 __m2) { return _mm_cmpgt_pi32(__m1, __m2); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_setzero_si64(void) { +__funline __m64 _mm_setzero_si64(void) { return (__m64)0LL; } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_set_pi32(int __i1, int __i0) { +__funline __m64 _mm_set_pi32(int __i1, int __i0) { return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) { +__funline __m64 _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) { return (__m64)__builtin_ia32_vec_init_v4hi(__w0, __w1, __w2, __w3); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, - char __b2, char __b1, char __b0) { +__funline __m64 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, + char __b2, char __b1, char __b0) { return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_setr_pi32(int __i0, int __i1) { +__funline __m64 _mm_setr_pi32(int __i0, int __i1) { return _mm_set_pi32(__i1, __i0); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) { +__funline __m64 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) { return _mm_set_pi16(__w3, __w2, __w1, __w0); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, - char __b5, char __b6, char __b7) { +__funline __m64 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, + char __b4, char __b5, char __b6, char __b7) { return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_set1_pi32(int __i) { +__funline __m64 _mm_set1_pi32(int __i) { return _mm_set_pi32(__i, __i); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_set1_pi16(short __w) { +__funline __m64 _mm_set1_pi16(short __w) { return _mm_set_pi16(__w, __w, __w, __w); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_set1_pi8(char __b) { +__funline __m64 _mm_set1_pi8(char __b) { return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b); } #ifdef __DISABLE_MMX__ diff --git a/third_party/intel/movdirintrin.internal.h b/third_party/intel/movdirintrin.internal.h index 72baaca64..ccf9bc58b 100644 --- 
a/third_party/intel/movdirintrin.internal.h +++ b/third_party/intel/movdirintrin.internal.h @@ -11,15 +11,11 @@ #define __DISABLE_MOVDIRI__ #endif /* __MOVDIRI__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _directstoreu_u32(void *__P, unsigned int __A) { +__funline void _directstoreu_u32(void *__P, unsigned int __A) { __builtin_ia32_directstoreu_u32((unsigned int *)__P, __A); } #ifdef __x86_64__ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _directstoreu_u64(void *__P, unsigned long long __A) { +__funline void _directstoreu_u64(void *__P, unsigned long long __A) { __builtin_ia32_directstoreu_u64((unsigned long long *)__P, __A); } #endif @@ -35,9 +31,7 @@ extern __inline void #define __DISABLE_MOVDIR64B__ #endif /* __MOVDIR64B__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _movdir64b(void *__P, const void *__Q) { +__funline void _movdir64b(void *__P, const void *__Q) { __builtin_ia32_movdir64b(__P, __Q); } diff --git a/third_party/intel/mwaitxintrin.internal.h b/third_party/intel/mwaitxintrin.internal.h index 86685b6eb..0db3aa2c4 100644 --- a/third_party/intel/mwaitxintrin.internal.h +++ b/third_party/intel/mwaitxintrin.internal.h @@ -8,15 +8,11 @@ #define __DISABLE_MWAITX__ #endif /* __MWAITX__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_monitorx(void const* __P, unsigned int __E, unsigned int __H) { +__funline void _mm_monitorx(void const* __P, unsigned int __E, unsigned int __H) { __builtin_ia32_monitorx(__P, __E, __H); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mwaitx(unsigned int __E, unsigned int __H, unsigned int __C) { +__funline void _mm_mwaitx(unsigned int __E, unsigned int __H, unsigned int __C) { __builtin_ia32_mwaitx(__E, __H, __C); } diff --git a/third_party/intel/pconfigintrin.internal.h b/third_party/intel/pconfigintrin.internal.h index a38d9195a..c0877e16d 100644 --- a/third_party/intel/pconfigintrin.internal.h +++ b/third_party/intel/pconfigintrin.internal.h @@ -24,9 +24,7 @@ : "a"(leaf), "b"(b), "c"(c), "d"(d) \ : "cc") -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _pconfig_u32(const unsigned int __L, size_t __D[]) { +__funline unsigned int _pconfig_u32(const unsigned int __L, size_t __D[]) { enum __pconfig_type { __PCONFIG_KEY_PROGRAM = 0x01, }; diff --git a/third_party/intel/pkuintrin.internal.h b/third_party/intel/pkuintrin.internal.h index b2175da76..789c1335e 100644 --- a/third_party/intel/pkuintrin.internal.h +++ b/third_party/intel/pkuintrin.internal.h @@ -11,15 +11,11 @@ #define __DISABLE_PKU__ #endif /* __PKU__ */ -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _rdpkru_u32(void) { +__funline unsigned int _rdpkru_u32(void) { return __builtin_ia32_rdpkru(); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _wrpkru(unsigned int __key) { +__funline void _wrpkru(unsigned int __key) { __builtin_ia32_wrpkru(__key); } diff --git a/third_party/intel/pmmintrin.internal.h b/third_party/intel/pmmintrin.internal.h index f4946b254..27f9c5e41 100644 --- a/third_party/intel/pmmintrin.internal.h +++ b/third_party/intel/pmmintrin.internal.h @@ -17,81 +17,55 @@ _mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (mode)) #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() 
& _MM_DENORMALS_ZERO_MASK) -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_addsub_ps(__m128 __X, __m128 __Y) { +__funline __m128 _mm_addsub_ps(__m128 __X, __m128 __Y) { return (__m128)__builtin_ia32_addsubps((__v4sf)__X, (__v4sf)__Y); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hadd_ps(__m128 __X, __m128 __Y) { +__funline __m128 _mm_hadd_ps(__m128 __X, __m128 __Y) { return (__m128)__builtin_ia32_haddps((__v4sf)__X, (__v4sf)__Y); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hsub_ps(__m128 __X, __m128 __Y) { +__funline __m128 _mm_hsub_ps(__m128 __X, __m128 __Y) { return (__m128)__builtin_ia32_hsubps((__v4sf)__X, (__v4sf)__Y); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_movehdup_ps(__m128 __X) { +__funline __m128 _mm_movehdup_ps(__m128 __X) { return (__m128)__builtin_ia32_movshdup((__v4sf)__X); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_moveldup_ps(__m128 __X) { +__funline __m128 _mm_moveldup_ps(__m128 __X) { return (__m128)__builtin_ia32_movsldup((__v4sf)__X); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_addsub_pd(__m128d __X, __m128d __Y) { +__funline __m128d _mm_addsub_pd(__m128d __X, __m128d __Y) { return (__m128d)__builtin_ia32_addsubpd((__v2df)__X, (__v2df)__Y); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hadd_pd(__m128d __X, __m128d __Y) { +__funline __m128d _mm_hadd_pd(__m128d __X, __m128d __Y) { return (__m128d)__builtin_ia32_haddpd((__v2df)__X, (__v2df)__Y); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hsub_pd(__m128d __X, __m128d __Y) { +__funline __m128d _mm_hsub_pd(__m128d __X, __m128d __Y) { return (__m128d)__builtin_ia32_hsubpd((__v2df)__X, (__v2df)__Y); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_loaddup_pd(double const *__P) { +__funline __m128d _mm_loaddup_pd(double const *__P) { return _mm_load1_pd(__P); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_movedup_pd(__m128d __X) { +__funline __m128d _mm_movedup_pd(__m128d __X) { return _mm_shuffle_pd(__X, __X, _MM_SHUFFLE2(0, 0)); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_lddqu_si128(__m128i const *__P) { +__funline __m128i _mm_lddqu_si128(__m128i const *__P) { return (__m128i)__builtin_ia32_lddqu((char const *)__P); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_monitor(void const *__P, unsigned int __E, unsigned int __H) { +__funline void _mm_monitor(void const *__P, unsigned int __E, unsigned int __H) { __builtin_ia32_monitor(__P, __E, __H); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mwait(unsigned int __E, unsigned int __H) { +__funline void _mm_mwait(unsigned int __E, unsigned int __H) { __builtin_ia32_mwait(__E, __H); } diff --git a/third_party/intel/popcntintrin.internal.h b/third_party/intel/popcntintrin.internal.h index 07739a58f..8f18eb598 100644 --- a/third_party/intel/popcntintrin.internal.h +++ b/third_party/intel/popcntintrin.internal.h @@ -8,16 +8,12 @@ #define __DISABLE_POPCNT__ #endif /* 
__POPCNT__ */ -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_popcnt_u32(unsigned int __X) { +__funline int _mm_popcnt_u32(unsigned int __X) { return __builtin_popcount(__X); } #ifdef __x86_64__ -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_popcnt_u64(unsigned long long __X) { +__funline long long _mm_popcnt_u64(unsigned long long __X) { return __builtin_popcountll(__X); } #endif diff --git a/third_party/intel/prfchwintrin.internal.h b/third_party/intel/prfchwintrin.internal.h index 8d727e960..66f911314 100644 --- a/third_party/intel/prfchwintrin.internal.h +++ b/third_party/intel/prfchwintrin.internal.h @@ -6,9 +6,7 @@ #ifndef _PRFCHWINTRIN_H_INCLUDED #define _PRFCHWINTRIN_H_INCLUDED -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_prefetchw(void *__P) { +__funline void _m_prefetchw(void *__P) { __builtin_prefetch(__P, 1, 3 /* _MM_HINT_T0 */); } diff --git a/third_party/intel/rdseedintrin.internal.h b/third_party/intel/rdseedintrin.internal.h index 6096637bc..c5125717e 100644 --- a/third_party/intel/rdseedintrin.internal.h +++ b/third_party/intel/rdseedintrin.internal.h @@ -11,22 +11,16 @@ #define __DISABLE_RDSEED__ #endif /* __RDSEED__ */ -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _rdseed16_step(unsigned short *__p) { +__funline int _rdseed16_step(unsigned short *__p) { return __builtin_ia32_rdseed_hi_step(__p); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _rdseed32_step(unsigned int *__p) { +__funline int _rdseed32_step(unsigned int *__p) { return __builtin_ia32_rdseed_si_step(__p); } #ifdef __x86_64__ -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _rdseed64_step(unsigned long long *__p) { +__funline int _rdseed64_step(unsigned long long *__p) { return __builtin_ia32_rdseed_di_step(__p); } #endif diff --git a/third_party/intel/rtmintrin.internal.h b/third_party/intel/rtmintrin.internal.h index 010588e60..370786179 100644 --- a/third_party/intel/rtmintrin.internal.h +++ b/third_party/intel/rtmintrin.internal.h @@ -20,22 +20,16 @@ #define _XABORT_NESTED (1 << 5) #define _XABORT_CODE(x) (((x) >> 24) & 0xFF) -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xbegin(void) { +__funline unsigned int _xbegin(void) { return __builtin_ia32_xbegin(); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xend(void) { +__funline void _xend(void) { __builtin_ia32_xend(); } #ifdef __OPTIMIZE__ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xabort(const unsigned int __imm) { +__funline void _xabort(const unsigned int __imm) { __builtin_ia32_xabort(__imm); } #else diff --git a/third_party/intel/sgxintrin.internal.h b/third_party/intel/sgxintrin.internal.h index 29d2877d4..3e5955943 100644 --- a/third_party/intel/sgxintrin.internal.h +++ b/third_party/intel/sgxintrin.internal.h @@ -80,9 +80,7 @@ : "a"(leaf), "b"(b), "c"(c), "d"(d) \ : "cc") -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _encls_u32(const unsigned int __L, size_t __D[]) { +__funline unsigned int _encls_u32(const unsigned int __L, size_t __D[]) { enum __encls_type { __SGX_ECREATE = 0x00, __SGX_EADD = 0x01, @@ -145,9 +143,7 @@ extern __inline unsigned int return 
__R; } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _enclu_u32(const unsigned int __L, size_t __D[]) { +__funline unsigned int _enclu_u32(const unsigned int __L, size_t __D[]) { enum __enclu_type { __SGX_EREPORT = 0x00, __SGX_EGETKEY = 0x01, @@ -186,9 +182,7 @@ extern __inline unsigned int return __R; } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _enclv_u32(const unsigned int __L, size_t __D[]) { +__funline unsigned int _enclv_u32(const unsigned int __L, size_t __D[]) { enum __enclv_type { __SGX_EDECVIRTCHILD = 0x00, __SGX_EINCVIRTCHILD = 0x01, diff --git a/third_party/intel/shaintrin.internal.h b/third_party/intel/shaintrin.internal.h index 05446f46e..1d3a6c139 100644 --- a/third_party/intel/shaintrin.internal.h +++ b/third_party/intel/shaintrin.internal.h @@ -11,28 +11,20 @@ #define __DISABLE_SHA__ #endif /* __SHA__ */ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sha1msg1_epu32(__m128i __A, __m128i __B) { +__funline __m128i _mm_sha1msg1_epu32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_sha1msg1((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sha1msg2_epu32(__m128i __A, __m128i __B) { +__funline __m128i _mm_sha1msg2_epu32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_sha1msg2((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sha1nexte_epu32(__m128i __A, __m128i __B) { +__funline __m128i _mm_sha1nexte_epu32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_sha1nexte((__v4si)__A, (__v4si)__B); } #ifdef __OPTIMIZE__ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sha1rnds4_epu32(__m128i __A, __m128i __B, const int __I) { +__funline __m128i _mm_sha1rnds4_epu32(__m128i __A, __m128i __B, const int __I) { return (__m128i)__builtin_ia32_sha1rnds4((__v4si)__A, (__v4si)__B, __I); } #else @@ -41,21 +33,15 @@ extern __inline __m128i (int)I)) #endif -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sha256msg1_epu32(__m128i __A, __m128i __B) { +__funline __m128i _mm_sha256msg1_epu32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_sha256msg1((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sha256msg2_epu32(__m128i __A, __m128i __B) { +__funline __m128i _mm_sha256msg2_epu32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_sha256msg2((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sha256rnds2_epu32(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_sha256rnds2_epu32(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__A, (__v4si)__B, (__v4si)__C); } diff --git a/third_party/intel/smmintrin.internal.h b/third_party/intel/smmintrin.internal.h index 18f3de4ac..022fdb427 100644 --- a/third_party/intel/smmintrin.internal.h +++ b/third_party/intel/smmintrin.internal.h @@ -25,21 +25,15 @@ #define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) - _mm_testz_si128(__m128i __M, __m128i __V) { +__funline int _mm_testz_si128(__m128i __M, __m128i __V) { return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_testc_si128(__m128i __M, __m128i __V) { +__funline int _mm_testc_si128(__m128i __M, __m128i __V) { return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_testnzc_si128(__m128i __M, __m128i __V) { +__funline int _mm_testnzc_si128(__m128i __M, __m128i __V) { return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); } @@ -50,15 +44,11 @@ extern __inline int #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) #ifdef __OPTIMIZE__ -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_round_pd(__m128d __V, const int __M) { +__funline __m128d _mm_round_pd(__m128d __V, const int __M) { return (__m128d)__builtin_ia32_roundpd((__v2df)__V, __M); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_round_sd(__m128d __D, __m128d __V, const int __M) { +__funline __m128d _mm_round_sd(__m128d __D, __m128d __V, const int __M) { return (__m128d)__builtin_ia32_roundsd((__v2df)__D, (__v2df)__V, __M); } #else @@ -71,15 +61,11 @@ extern __inline __m128d #endif #ifdef __OPTIMIZE__ -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_round_ps(__m128 __V, const int __M) { +__funline __m128 _mm_round_ps(__m128 __V, const int __M) { return (__m128)__builtin_ia32_roundps((__v4sf)__V, __M); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_round_ss(__m128 __D, __m128 __V, const int __M) { +__funline __m128 _mm_round_ss(__m128 __D, __m128 __V, const int __M) { return (__m128)__builtin_ia32_roundss((__v4sf)__D, (__v4sf)__V, __M); } #else @@ -104,9 +90,7 @@ extern __inline __m128 #define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR) #ifdef __OPTIMIZE__ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_blend_epi16(__m128i __X, __m128i __Y, const int __M) { +__funline __m128i _mm_blend_epi16(__m128i __X, __m128i __Y, const int __M) { return (__m128i)__builtin_ia32_pblendw128((__v8hi)__X, (__v8hi)__Y, __M); } #else @@ -115,17 +99,13 @@ extern __inline __m128i (__v8hi)(__m128i)(Y), (int)(M))) #endif -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_blendv_epi8(__m128i __X, __m128i __Y, __m128i __M) { +__funline __m128i _mm_blendv_epi8(__m128i __X, __m128i __Y, __m128i __M) { return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__X, (__v16qi)__Y, (__v16qi)__M); } #ifdef __OPTIMIZE__ -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_blend_ps(__m128 __X, __m128 __Y, const int __M) { +__funline __m128 _mm_blend_ps(__m128 __X, __m128 __Y, const int __M) { return (__m128)__builtin_ia32_blendps((__v4sf)__X, (__v4sf)__Y, __M); } #else @@ -134,16 +114,12 @@ extern __inline __m128 (int)(M))) #endif -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_blendv_ps(__m128 __X, __m128 __Y, __m128 __M) { +__funline __m128 _mm_blendv_ps(__m128 __X, __m128 __Y, __m128 __M) { return (__m128)__builtin_ia32_blendvps((__v4sf)__X, (__v4sf)__Y, (__v4sf)__M); } #ifdef 
__OPTIMIZE__ -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_blend_pd(__m128d __X, __m128d __Y, const int __M) { +__funline __m128d _mm_blend_pd(__m128d __X, __m128d __Y, const int __M) { return (__m128d)__builtin_ia32_blendpd((__v2df)__X, (__v2df)__Y, __M); } #else @@ -152,23 +128,17 @@ extern __inline __m128d (int)(M))) #endif -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_blendv_pd(__m128d __X, __m128d __Y, __m128d __M) { +__funline __m128d _mm_blendv_pd(__m128d __X, __m128d __Y, __m128d __M) { return (__m128d)__builtin_ia32_blendvpd((__v2df)__X, (__v2df)__Y, (__v2df)__M); } #ifdef __OPTIMIZE__ -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_dp_ps(__m128 __X, __m128 __Y, const int __M) { +__funline __m128 _mm_dp_ps(__m128 __X, __m128 __Y, const int __M) { return (__m128)__builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, __M); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_dp_pd(__m128d __X, __m128d __Y, const int __M) { +__funline __m128d _mm_dp_pd(__m128d __X, __m128d __Y, const int __M) { return (__m128d)__builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, __M); } #else @@ -181,76 +151,52 @@ extern __inline __m128d (int)(M))) #endif -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpeq_epi64(__m128i __X, __m128i __Y) { +__funline __m128i _mm_cmpeq_epi64(__m128i __X, __m128i __Y) { return (__m128i)((__v2di)__X == (__v2di)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_min_epi8(__m128i __X, __m128i __Y) { +__funline __m128i _mm_min_epi8(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_pminsb128((__v16qi)__X, (__v16qi)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_max_epi8(__m128i __X, __m128i __Y) { +__funline __m128i _mm_max_epi8(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_pmaxsb128((__v16qi)__X, (__v16qi)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_min_epu16(__m128i __X, __m128i __Y) { +__funline __m128i _mm_min_epu16(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_pminuw128((__v8hi)__X, (__v8hi)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_max_epu16(__m128i __X, __m128i __Y) { +__funline __m128i _mm_max_epu16(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_pmaxuw128((__v8hi)__X, (__v8hi)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_min_epi32(__m128i __X, __m128i __Y) { +__funline __m128i _mm_min_epi32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_pminsd128((__v4si)__X, (__v4si)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_max_epi32(__m128i __X, __m128i __Y) { +__funline __m128i _mm_max_epi32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_pmaxsd128((__v4si)__X, (__v4si)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_min_epu32(__m128i __X, __m128i __Y) { +__funline __m128i _mm_min_epu32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_pminud128((__v4si)__X, (__v4si)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) - _mm_max_epu32(__m128i __X, __m128i __Y) { +__funline __m128i _mm_max_epu32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_pmaxud128((__v4si)__X, (__v4si)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mullo_epi32(__m128i __X, __m128i __Y) { +__funline __m128i _mm_mullo_epi32(__m128i __X, __m128i __Y) { return (__m128i)((__v4su)__X * (__v4su)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mul_epi32(__m128i __X, __m128i __Y) { +__funline __m128i _mm_mul_epi32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_pmuldq128((__v4si)__X, (__v4si)__Y); } #ifdef __OPTIMIZE__ -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_ps(__m128 __D, __m128 __S, const int __N) { +__funline __m128 _mm_insert_ps(__m128 __D, __m128 __S, const int __N) { return (__m128)__builtin_ia32_insertps128((__v4sf)__D, (__v4sf)__S, __N); } #else @@ -262,9 +208,7 @@ extern __inline __m128 #define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M)) #ifdef __OPTIMIZE__ -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_ps(__m128 __X, const int __N) { +__funline int _mm_extract_ps(__m128 __X, const int __N) { union { int i; float f; @@ -291,22 +235,16 @@ extern __inline int _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e)) #ifdef __OPTIMIZE__ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi8(__m128i __D, int __S, const int __N) { +__funline __m128i _mm_insert_epi8(__m128i __D, int __S, const int __N) { return (__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)__D, __S, __N); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi32(__m128i __D, int __S, const int __N) { +__funline __m128i _mm_insert_epi32(__m128i __D, int __S, const int __N) { return (__m128i)__builtin_ia32_vec_set_v4si((__v4si)__D, __S, __N); } #ifdef __x86_64__ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi64(__m128i __D, long long __S, const int __N) { +__funline __m128i _mm_insert_epi64(__m128i __D, long long __S, const int __N) { return (__m128i)__builtin_ia32_vec_set_v2di((__v2di)__D, __S, __N); } #endif @@ -327,22 +265,16 @@ extern __inline __m128i #endif #ifdef __OPTIMIZE__ -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_epi8(__m128i __X, const int __N) { +__funline int _mm_extract_epi8(__m128i __X, const int __N) { return (unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)__X, __N); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_epi32(__m128i __X, const int __N) { +__funline int _mm_extract_epi32(__m128i __X, const int __N) { return __builtin_ia32_vec_ext_v4si((__v4si)__X, __N); } #ifdef __x86_64__ -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_epi64(__m128i __X, const int __N) { +__funline long long _mm_extract_epi64(__m128i __X, const int __N) { return __builtin_ia32_vec_ext_v2di((__v2di)__X, __N); } #endif @@ -359,94 +291,64 @@ extern __inline long long #endif #endif -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_minpos_epu16(__m128i 
__X) { +__funline __m128i _mm_minpos_epu16(__m128i __X) { return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepi8_epi32(__m128i __X) { +__funline __m128i _mm_cvtepi8_epi32(__m128i __X) { return (__m128i)__builtin_ia32_pmovsxbd128((__v16qi)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepi16_epi32(__m128i __X) { +__funline __m128i _mm_cvtepi16_epi32(__m128i __X) { return (__m128i)__builtin_ia32_pmovsxwd128((__v8hi)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepi8_epi64(__m128i __X) { +__funline __m128i _mm_cvtepi8_epi64(__m128i __X) { return (__m128i)__builtin_ia32_pmovsxbq128((__v16qi)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepi32_epi64(__m128i __X) { +__funline __m128i _mm_cvtepi32_epi64(__m128i __X) { return (__m128i)__builtin_ia32_pmovsxdq128((__v4si)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepi16_epi64(__m128i __X) { +__funline __m128i _mm_cvtepi16_epi64(__m128i __X) { return (__m128i)__builtin_ia32_pmovsxwq128((__v8hi)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepi8_epi16(__m128i __X) { +__funline __m128i _mm_cvtepi8_epi16(__m128i __X) { return (__m128i)__builtin_ia32_pmovsxbw128((__v16qi)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepu8_epi32(__m128i __X) { +__funline __m128i _mm_cvtepu8_epi32(__m128i __X) { return (__m128i)__builtin_ia32_pmovzxbd128((__v16qi)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepu16_epi32(__m128i __X) { +__funline __m128i _mm_cvtepu16_epi32(__m128i __X) { return (__m128i)__builtin_ia32_pmovzxwd128((__v8hi)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepu8_epi64(__m128i __X) { +__funline __m128i _mm_cvtepu8_epi64(__m128i __X) { return (__m128i)__builtin_ia32_pmovzxbq128((__v16qi)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepu32_epi64(__m128i __X) { +__funline __m128i _mm_cvtepu32_epi64(__m128i __X) { return (__m128i)__builtin_ia32_pmovzxdq128((__v4si)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepu16_epi64(__m128i __X) { +__funline __m128i _mm_cvtepu16_epi64(__m128i __X) { return (__m128i)__builtin_ia32_pmovzxwq128((__v8hi)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtepu8_epi16(__m128i __X) { +__funline __m128i _mm_cvtepu8_epi16(__m128i __X) { return (__m128i)__builtin_ia32_pmovzxbw128((__v16qi)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_packus_epi32(__m128i __X, __m128i __Y) { +__funline __m128i _mm_packus_epi32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_packusdw128((__v4si)__X, (__v4si)__Y); } #ifdef __OPTIMIZE__ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mpsadbw_epu8(__m128i __X, __m128i __Y, const int __M) { +__funline __m128i _mm_mpsadbw_epu8(__m128i __X, __m128i __Y, const int __M) { 
return (__m128i)__builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, __M); } #else @@ -455,9 +357,7 @@ extern __inline __m128i (__v16qi)(__m128i)(Y), (int)(M))) #endif -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_stream_load_si128(__m128i *__X) { +__funline __m128i _mm_stream_load_si128(__m128i *__X) { return (__m128i)__builtin_ia32_movntdqa((__v2di *)__X); } @@ -489,28 +389,22 @@ extern __inline __m128i #define _SIDD_UNIT_MASK 0x40 #ifdef __OPTIMIZE__ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpistrm(__m128i __X, __m128i __Y, const int __M) { +__funline __m128i _mm_cmpistrm(__m128i __X, __m128i __Y, const int __M) { return (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)__X, (__v16qi)__Y, __M); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpistri(__m128i __X, __m128i __Y, const int __M) { +__funline int _mm_cmpistri(__m128i __X, __m128i __Y, const int __M) { return __builtin_ia32_pcmpistri128((__v16qi)__X, (__v16qi)__Y, __M); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpestrm(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) { +__funline __m128i _mm_cmpestrm(__m128i __X, int __LX, __m128i __Y, int __LY, + const int __M) { return (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)__X, __LX, (__v16qi)__Y, __LY, __M); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpestri(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) { +__funline int _mm_cmpestri(__m128i __X, int __LX, __m128i __Y, int __LY, + const int __M) { return __builtin_ia32_pcmpestri128((__v16qi)__X, __LX, (__v16qi)__Y, __LY, __M); } @@ -533,67 +427,52 @@ extern __inline int #endif #ifdef __OPTIMIZE__ -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpistra(__m128i __X, __m128i __Y, const int __M) { +__funline int _mm_cmpistra(__m128i __X, __m128i __Y, const int __M) { return __builtin_ia32_pcmpistria128((__v16qi)__X, (__v16qi)__Y, __M); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpistrc(__m128i __X, __m128i __Y, const int __M) { +__funline int _mm_cmpistrc(__m128i __X, __m128i __Y, const int __M) { return __builtin_ia32_pcmpistric128((__v16qi)__X, (__v16qi)__Y, __M); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpistro(__m128i __X, __m128i __Y, const int __M) { +__funline int _mm_cmpistro(__m128i __X, __m128i __Y, const int __M) { return __builtin_ia32_pcmpistrio128((__v16qi)__X, (__v16qi)__Y, __M); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpistrs(__m128i __X, __m128i __Y, const int __M) { +__funline int _mm_cmpistrs(__m128i __X, __m128i __Y, const int __M) { return __builtin_ia32_pcmpistris128((__v16qi)__X, (__v16qi)__Y, __M); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpistrz(__m128i __X, __m128i __Y, const int __M) { +__funline int _mm_cmpistrz(__m128i __X, __m128i __Y, const int __M) { return __builtin_ia32_pcmpistriz128((__v16qi)__X, (__v16qi)__Y, __M); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpestra(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) { +__funline int _mm_cmpestra(__m128i __X, int 
__LX, __m128i __Y, int __LY, + const int __M) { return __builtin_ia32_pcmpestria128((__v16qi)__X, __LX, (__v16qi)__Y, __LY, __M); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpestrc(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) { +__funline int _mm_cmpestrc(__m128i __X, int __LX, __m128i __Y, int __LY, + const int __M) { return __builtin_ia32_pcmpestric128((__v16qi)__X, __LX, (__v16qi)__Y, __LY, __M); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpestro(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) { +__funline int _mm_cmpestro(__m128i __X, int __LX, __m128i __Y, int __LY, + const int __M) { return __builtin_ia32_pcmpestrio128((__v16qi)__X, __LX, (__v16qi)__Y, __LY, __M); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpestrs(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) { +__funline int _mm_cmpestrs(__m128i __X, int __LX, __m128i __Y, int __LY, + const int __M) { return __builtin_ia32_pcmpestris128((__v16qi)__X, __LX, (__v16qi)__Y, __LY, __M); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpestrz(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) { +__funline int _mm_cmpestrz(__m128i __X, int __LX, __m128i __Y, int __LY, + const int __M) { return __builtin_ia32_pcmpestriz128((__v16qi)__X, __LX, (__v16qi)__Y, __LY, __M); } @@ -636,9 +515,7 @@ extern __inline int (int)(M))) #endif -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpgt_epi64(__m128i __X, __m128i __Y) { +__funline __m128i _mm_cmpgt_epi64(__m128i __X, __m128i __Y) { return (__m128i)((__v2di)__X > (__v2di)__Y); } @@ -667,28 +544,21 @@ extern __inline __m128i #endif /* __SSE4_1__ */ /* Accumulate CRC32 (polynomial 0x11EDC6F41) value. 
*/ -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_crc32_u8(unsigned int __C, unsigned char __V) { +__funline unsigned int _mm_crc32_u8(unsigned int __C, unsigned char __V) { return __builtin_ia32_crc32qi(__C, __V); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_crc32_u16(unsigned int __C, unsigned short __V) { +__funline unsigned int _mm_crc32_u16(unsigned int __C, unsigned short __V) { return __builtin_ia32_crc32hi(__C, __V); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_crc32_u32(unsigned int __C, unsigned int __V) { +__funline unsigned int _mm_crc32_u32(unsigned int __C, unsigned int __V) { return __builtin_ia32_crc32si(__C, __V); } #ifdef __x86_64__ -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_crc32_u64(unsigned long long __C, unsigned long long __V) { +__funline unsigned long long _mm_crc32_u64(unsigned long long __C, + unsigned long long __V) { return __builtin_ia32_crc32di(__C, __V); } #endif diff --git a/third_party/intel/tbmintrin.internal.h b/third_party/intel/tbmintrin.internal.h index a1c93feec..d740e2274 100644 --- a/third_party/intel/tbmintrin.internal.h +++ b/third_party/intel/tbmintrin.internal.h @@ -12,9 +12,7 @@ #endif /* __TBM__ */ #ifdef __OPTIMIZE__ -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __bextri_u32(unsigned int __X, const unsigned int __I) { +__funline unsigned int __bextri_u32(unsigned int __X, const unsigned int __I) { return __builtin_ia32_bextri_u32(__X, __I); } #else @@ -23,65 +21,46 @@ extern __inline unsigned int (unsigned int)(I))) #endif /*__OPTIMIZE__ */ -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blcfill_u32(unsigned int __X) { +__funline unsigned int __blcfill_u32(unsigned int __X) { return __X & (__X + 1); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blci_u32(unsigned int __X) { +__funline unsigned int __blci_u32(unsigned int __X) { return __X | ~(__X + 1); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blcic_u32(unsigned int __X) { +__funline unsigned int __blcic_u32(unsigned int __X) { return ~__X & (__X + 1); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blcmsk_u32(unsigned int __X) { +__funline unsigned int __blcmsk_u32(unsigned int __X) { return __X ^ (__X + 1); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blcs_u32(unsigned int __X) { +__funline unsigned int __blcs_u32(unsigned int __X) { return __X | (__X + 1); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blsfill_u32(unsigned int __X) { +__funline unsigned int __blsfill_u32(unsigned int __X) { return __X | (__X - 1); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blsic_u32(unsigned int __X) { +__funline unsigned int __blsic_u32(unsigned int __X) { return ~__X | (__X - 1); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __t1mskc_u32(unsigned int __X) { +__funline unsigned int __t1mskc_u32(unsigned int __X) { return ~__X | (__X + 1); } 
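
/* Editorial aside — the TBM bodies in this hunk are pure bit arithmetic,
   so the __funline conversion is easy to sanity-check with concrete
   values. A minimal standalone sketch in plain C (no TBM hardware or
   intrinsics assumed; the lowercase helper names below are illustrative
   stand-ins that just mirror the formulas converted above): */

#include <assert.h>

static unsigned blcfill(unsigned x) { return x & (x + 1); }  /* clears the trailing run of set bits */
static unsigned blsfill(unsigned x) { return x | (x - 1); }  /* sets the trailing zero bits */
static unsigned tzmsk(unsigned x)   { return ~x & (x - 1); } /* mask of the trailing zero bits */

int main(void) {
  assert(blcfill(0x67) == 0x60); /* 0110 0111 -> 0110 0000 */
  assert(blsfill(0x68) == 0x6f); /* 0110 1000 -> 0110 1111 */
  assert(tzmsk(0x68)   == 0x07); /* three trailing zeros -> 0b111 */
  return 0;
}
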
-extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __tzmsk_u32(unsigned int __X) { +__funline unsigned int __tzmsk_u32(unsigned int __X) { return ~__X & (__X - 1); } #ifdef __x86_64__ #ifdef __OPTIMIZE__ -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __bextri_u64(unsigned long long __X, const unsigned int __I) { +__funline unsigned long long __bextri_u64(unsigned long long __X, + const unsigned int __I) { return __builtin_ia32_bextri_u64(__X, __I); } #else @@ -90,57 +69,39 @@ extern __inline unsigned long long (unsigned long long)(I))) #endif /*__OPTIMIZE__ */ -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blcfill_u64(unsigned long long __X) { +__funline unsigned long long __blcfill_u64(unsigned long long __X) { return __X & (__X + 1); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blci_u64(unsigned long long __X) { +__funline unsigned long long __blci_u64(unsigned long long __X) { return __X | ~(__X + 1); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blcic_u64(unsigned long long __X) { +__funline unsigned long long __blcic_u64(unsigned long long __X) { return ~__X & (__X + 1); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blcmsk_u64(unsigned long long __X) { +__funline unsigned long long __blcmsk_u64(unsigned long long __X) { return __X ^ (__X + 1); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blcs_u64(unsigned long long __X) { +__funline unsigned long long __blcs_u64(unsigned long long __X) { return __X | (__X + 1); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blsfill_u64(unsigned long long __X) { +__funline unsigned long long __blsfill_u64(unsigned long long __X) { return __X | (__X - 1); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __blsic_u64(unsigned long long __X) { +__funline unsigned long long __blsic_u64(unsigned long long __X) { return ~__X | (__X - 1); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __t1mskc_u64(unsigned long long __X) { +__funline unsigned long long __t1mskc_u64(unsigned long long __X) { return ~__X | (__X + 1); } -extern __inline unsigned long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - __tzmsk_u64(unsigned long long __X) { +__funline unsigned long long __tzmsk_u64(unsigned long long __X) { return ~__X & (__X - 1); } diff --git a/third_party/intel/tmmintrin.internal.h b/third_party/intel/tmmintrin.internal.h index 2c01c9265..7f56e25d5 100644 --- a/third_party/intel/tmmintrin.internal.h +++ b/third_party/intel/tmmintrin.internal.h @@ -9,160 +9,108 @@ #define __DISABLE_SSSE3__ #endif /* __SSSE3__ */ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hadd_epi16(__m128i __X, __m128i __Y) { +__funline __m128i _mm_hadd_epi16(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_phaddw128((__v8hi)__X, (__v8hi)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hadd_epi32(__m128i __X, __m128i __Y) { +__funline 
__m128i _mm_hadd_epi32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_phaddd128((__v4si)__X, (__v4si)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hadds_epi16(__m128i __X, __m128i __Y) { +__funline __m128i _mm_hadds_epi16(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__X, (__v8hi)__Y); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hadd_pi16(__m64 __X, __m64 __Y) { +__funline __m64 _mm_hadd_pi16(__m64 __X, __m64 __Y) { return (__m64)__builtin_ia32_phaddw((__v4hi)__X, (__v4hi)__Y); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hadd_pi32(__m64 __X, __m64 __Y) { +__funline __m64 _mm_hadd_pi32(__m64 __X, __m64 __Y) { return (__m64)__builtin_ia32_phaddd((__v2si)__X, (__v2si)__Y); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hadds_pi16(__m64 __X, __m64 __Y) { +__funline __m64 _mm_hadds_pi16(__m64 __X, __m64 __Y) { return (__m64)__builtin_ia32_phaddsw((__v4hi)__X, (__v4hi)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hsub_epi16(__m128i __X, __m128i __Y) { +__funline __m128i _mm_hsub_epi16(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_phsubw128((__v8hi)__X, (__v8hi)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hsub_epi32(__m128i __X, __m128i __Y) { +__funline __m128i _mm_hsub_epi32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_phsubd128((__v4si)__X, (__v4si)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hsubs_epi16(__m128i __X, __m128i __Y) { +__funline __m128i _mm_hsubs_epi16(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__X, (__v8hi)__Y); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hsub_pi16(__m64 __X, __m64 __Y) { +__funline __m64 _mm_hsub_pi16(__m64 __X, __m64 __Y) { return (__m64)__builtin_ia32_phsubw((__v4hi)__X, (__v4hi)__Y); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hsub_pi32(__m64 __X, __m64 __Y) { +__funline __m64 _mm_hsub_pi32(__m64 __X, __m64 __Y) { return (__m64)__builtin_ia32_phsubd((__v2si)__X, (__v2si)__Y); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hsubs_pi16(__m64 __X, __m64 __Y) { +__funline __m64 _mm_hsubs_pi16(__m64 __X, __m64 __Y) { return (__m64)__builtin_ia32_phsubsw((__v4hi)__X, (__v4hi)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maddubs_epi16(__m128i __X, __m128i __Y) { +__funline __m128i _mm_maddubs_epi16(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__X, (__v16qi)__Y); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maddubs_pi16(__m64 __X, __m64 __Y) { +__funline __m64 _mm_maddubs_pi16(__m64 __X, __m64 __Y) { return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__X, (__v8qi)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mulhrs_epi16(__m128i __X, __m128i __Y) { +__funline __m128i _mm_mulhrs_epi16(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__X, (__v8hi)__Y); } 
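
/* Editorial aside — _mm_mulhrs_epi16, converted just above, is the SSSE3
   op whose rounding semantics are easiest to misread: each signed 16-bit
   lane computes (a*b + 0x4000) >> 15, i.e. a rounded Q15 fixed-point
   multiply. A minimal sketch, assuming a host toolchain with SSSE3
   enabled (-mssse3) and the standard <tmmintrin.h> API rather than this
   patched header: */

#include <stdio.h>
#include <tmmintrin.h>

int main(void) {
  __m128i a = _mm_set1_epi16(0x4000); /* 0.5 in Q15 */
  __m128i b = _mm_set1_epi16(0x2000); /* 0.25 in Q15 */
  __m128i r = _mm_mulhrs_epi16(a, b); /* each lane: (a*b + 0x4000) >> 15 */
  short out[8];
  _mm_storeu_si128((__m128i *)out, r);
  printf("0x%04x\n", out[0]); /* prints 0x1000, i.e. 0.125 in Q15 */
  return 0;
}
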
-extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mulhrs_pi16(__m64 __X, __m64 __Y) { +__funline __m64 _mm_mulhrs_pi16(__m64 __X, __m64 __Y) { return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__X, (__v4hi)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shuffle_epi8(__m128i __X, __m128i __Y) { +__funline __m128i _mm_shuffle_epi8(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_pshufb128((__v16qi)__X, (__v16qi)__Y); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shuffle_pi8(__m64 __X, __m64 __Y) { +__funline __m64 _mm_shuffle_pi8(__m64 __X, __m64 __Y) { return (__m64)__builtin_ia32_pshufb((__v8qi)__X, (__v8qi)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sign_epi8(__m128i __X, __m128i __Y) { +__funline __m128i _mm_sign_epi8(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psignb128((__v16qi)__X, (__v16qi)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sign_epi16(__m128i __X, __m128i __Y) { +__funline __m128i _mm_sign_epi16(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psignw128((__v8hi)__X, (__v8hi)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sign_epi32(__m128i __X, __m128i __Y) { +__funline __m128i _mm_sign_epi32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psignd128((__v4si)__X, (__v4si)__Y); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sign_pi8(__m64 __X, __m64 __Y) { +__funline __m64 _mm_sign_pi8(__m64 __X, __m64 __Y) { return (__m64)__builtin_ia32_psignb((__v8qi)__X, (__v8qi)__Y); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sign_pi16(__m64 __X, __m64 __Y) { +__funline __m64 _mm_sign_pi16(__m64 __X, __m64 __Y) { return (__m64)__builtin_ia32_psignw((__v4hi)__X, (__v4hi)__Y); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sign_pi32(__m64 __X, __m64 __Y) { +__funline __m64 _mm_sign_pi32(__m64 __X, __m64 __Y) { return (__m64)__builtin_ia32_psignd((__v2si)__X, (__v2si)__Y); } #ifdef __OPTIMIZE__ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) { +__funline __m128i _mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) { return (__m128i)__builtin_ia32_palignr128((__v2di)__X, (__v2di)__Y, __N * 8); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_alignr_pi8(__m64 __X, __m64 __Y, const int __N) { +__funline __m64 _mm_alignr_pi8(__m64 __X, __m64 __Y, const int __N) { return (__m64)__builtin_ia32_palignr((__v1di)__X, (__v1di)__Y, __N * 8); } #else @@ -174,39 +122,27 @@ extern __inline __m64 (int)(N)*8)) #endif -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_abs_epi8(__m128i __X) { +__funline __m128i _mm_abs_epi8(__m128i __X) { return (__m128i)__builtin_ia32_pabsb128((__v16qi)__X); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_abs_epi16(__m128i __X) { +__funline __m128i _mm_abs_epi16(__m128i __X) { return (__m128i)__builtin_ia32_pabsw128((__v8hi)__X); } -extern __inline __m128i - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_abs_epi32(__m128i __X) { +__funline __m128i _mm_abs_epi32(__m128i __X) { return (__m128i)__builtin_ia32_pabsd128((__v4si)__X); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_abs_pi8(__m64 __X) { +__funline __m64 _mm_abs_pi8(__m64 __X) { return (__m64)__builtin_ia32_pabsb((__v8qi)__X); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_abs_pi16(__m64 __X) { +__funline __m64 _mm_abs_pi16(__m64 __X) { return (__m64)__builtin_ia32_pabsw((__v4hi)__X); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_abs_pi32(__m64 __X) { +__funline __m64 _mm_abs_pi32(__m64 __X) { return (__m64)__builtin_ia32_pabsd((__v2si)__X); } diff --git a/third_party/intel/vaesintrin.internal.h b/third_party/intel/vaesintrin.internal.h index 428db99a1..a71e548d7 100644 --- a/third_party/intel/vaesintrin.internal.h +++ b/third_party/intel/vaesintrin.internal.h @@ -9,27 +9,19 @@ #define __DISABLE_VAES__ #endif /* __VAES__ */ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_aesdec_epi128(__m256i __A, __m256i __B) { +__funline __m256i _mm256_aesdec_epi128(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_vaesdec_v32qi((__v32qi)__A, (__v32qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_aesdeclast_epi128(__m256i __A, __m256i __B) { +__funline __m256i _mm256_aesdeclast_epi128(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_vaesdeclast_v32qi((__v32qi)__A, (__v32qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_aesenc_epi128(__m256i __A, __m256i __B) { +__funline __m256i _mm256_aesenc_epi128(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_vaesenc_v32qi((__v32qi)__A, (__v32qi)__B); } -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_aesenclast_epi128(__m256i __A, __m256i __B) { +__funline __m256i _mm256_aesenclast_epi128(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_vaesenclast_v32qi((__v32qi)__A, (__v32qi)__B); } @@ -44,27 +36,19 @@ extern __inline __m256i #define __DISABLE_VAESF__ #endif /* __VAES__ */ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_aesdec_epi128(__m512i __A, __m512i __B) { +__funline __m512i _mm512_aesdec_epi128(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_vaesdec_v64qi((__v64qi)__A, (__v64qi)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_aesdeclast_epi128(__m512i __A, __m512i __B) { +__funline __m512i _mm512_aesdeclast_epi128(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_vaesdeclast_v64qi((__v64qi)__A, (__v64qi)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_aesenc_epi128(__m512i __A, __m512i __B) { +__funline __m512i _mm512_aesenc_epi128(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_vaesenc_v64qi((__v64qi)__A, (__v64qi)__B); } -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_aesenclast_epi128(__m512i __A, __m512i __B) { +__funline __m512i _mm512_aesenclast_epi128(__m512i __A, __m512i __B) { return 
(__m512i)__builtin_ia32_vaesenclast_v64qi((__v64qi)__A, (__v64qi)__B); } diff --git a/third_party/intel/vpclmulqdqintrin.internal.h b/third_party/intel/vpclmulqdqintrin.internal.h index f988bd18a..49454f499 100644 --- a/third_party/intel/vpclmulqdqintrin.internal.h +++ b/third_party/intel/vpclmulqdqintrin.internal.h @@ -12,9 +12,8 @@ #endif /* __VPCLMULQDQF__ */ #ifdef __OPTIMIZE__ -extern __inline __m512i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm512_clmulepi64_epi128(__m512i __A, __m512i __B, const int __C) { +__funline __m512i _mm512_clmulepi64_epi128(__m512i __A, __m512i __B, + const int __C) { return (__m512i)__builtin_ia32_vpclmulqdq_v8di((__v8di)__A, (__v8di)__B, __C); } #else @@ -35,9 +34,8 @@ extern __inline __m512i #endif /* __VPCLMULQDQ__ */ #ifdef __OPTIMIZE__ -extern __inline __m256i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_clmulepi64_epi128(__m256i __A, __m256i __B, const int __C) { +__funline __m256i _mm256_clmulepi64_epi128(__m256i __A, __m256i __B, + const int __C) { return (__m256i)__builtin_ia32_vpclmulqdq_v4di((__v4di)__A, (__v4di)__B, __C); } #else diff --git a/third_party/intel/waitpkgintrin.internal.h b/third_party/intel/waitpkgintrin.internal.h index 1a659070a..3f4f19254 100644 --- a/third_party/intel/waitpkgintrin.internal.h +++ b/third_party/intel/waitpkgintrin.internal.h @@ -11,21 +11,15 @@ #define __DISABLE_WAITPKG__ #endif /* __WAITPKG__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _umonitor(void *__A) { +__funline void _umonitor(void *__A) { __builtin_ia32_umonitor(__A); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _umwait(unsigned int __A, unsigned long long __B) { +__funline unsigned char _umwait(unsigned int __A, unsigned long long __B) { return __builtin_ia32_umwait(__A, __B); } -extern __inline unsigned char - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _tpause(unsigned int __A, unsigned long long __B) { +__funline unsigned char _tpause(unsigned int __A, unsigned long long __B) { return __builtin_ia32_tpause(__A, __B); } diff --git a/third_party/intel/wbnoinvdintrin.internal.h b/third_party/intel/wbnoinvdintrin.internal.h index 9e78f8abf..72b06d30a 100644 --- a/third_party/intel/wbnoinvdintrin.internal.h +++ b/third_party/intel/wbnoinvdintrin.internal.h @@ -11,9 +11,7 @@ #define __DISABLE_WBNOINVD__ #endif /* __WBNOINVD__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _wbnoinvd(void) { +__funline void _wbnoinvd(void) { __builtin_ia32_wbnoinvd(); } diff --git a/third_party/intel/wmmintrin.internal.h b/third_party/intel/wmmintrin.internal.h index 9ddc5a24c..2a5819959 100644 --- a/third_party/intel/wmmintrin.internal.h +++ b/third_party/intel/wmmintrin.internal.h @@ -9,40 +9,28 @@ #define __DISABLE_AES__ #endif /* __AES__ */ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_aesdec_si128(__m128i __X, __m128i __Y) { +__funline __m128i _mm_aesdec_si128(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_aesdec128((__v2di)__X, (__v2di)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_aesdeclast_si128(__m128i __X, __m128i __Y) { +__funline __m128i _mm_aesdeclast_si128(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__X, (__v2di)__Y); } -extern __inline __m128i - 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_aesenc_si128(__m128i __X, __m128i __Y) { +__funline __m128i _mm_aesenc_si128(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_aesenc128((__v2di)__X, (__v2di)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_aesenclast_si128(__m128i __X, __m128i __Y) { +__funline __m128i _mm_aesenclast_si128(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_aesenclast128((__v2di)__X, (__v2di)__Y); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_aesimc_si128(__m128i __X) { +__funline __m128i _mm_aesimc_si128(__m128i __X) { return (__m128i)__builtin_ia32_aesimc128((__v2di)__X); } #ifdef __OPTIMIZE__ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_aeskeygenassist_si128(__m128i __X, const int __C) { +__funline __m128i _mm_aeskeygenassist_si128(__m128i __X, const int __C) { return (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)__X, __C); } #else @@ -62,9 +50,7 @@ extern __inline __m128i #endif /* __PCLMUL__ */ #ifdef __OPTIMIZE__ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I) { +__funline __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I) { return (__m128i)__builtin_ia32_pclmulqdq128((__v2di)__X, (__v2di)__Y, __I); } #else diff --git a/third_party/intel/xmmintrin.internal.h b/third_party/intel/xmmintrin.internal.h index 6e0567cc9..909a1e3a2 100644 --- a/third_party/intel/xmmintrin.internal.h +++ b/third_party/intel/xmmintrin.internal.h @@ -14,9 +14,7 @@ enum _mm_hint { }; #ifdef __OPTIMIZE__ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_prefetch(const void *__P, enum _mm_hint __I) { +__funline void _mm_prefetch(const void *__P, enum _mm_hint __I) { __builtin_prefetch(__P, (__I & 0x4) >> 2, __I & 0x3); } #else @@ -65,491 +63,333 @@ typedef float __v4sf __attribute__((__vector_size__(16))); #define _MM_FLUSH_ZERO_ON 0x8000 #define _MM_FLUSH_ZERO_OFF 0x0000 -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_undefined_ps(void) { +__funline __m128 _mm_undefined_ps(void) { __m128 __Y = __Y; return __Y; } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_setzero_ps(void) { +__funline __m128 _mm_setzero_ps(void) { return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f}; } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_add_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_add_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_addss((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sub_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_sub_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_subss((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mul_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_mul_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_mulss((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_div_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_div_ss(__m128 __A, __m128 __B) 
{ return (__m128)__builtin_ia32_divss((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sqrt_ss(__m128 __A) { +__funline __m128 _mm_sqrt_ss(__m128 __A) { return (__m128)__builtin_ia32_sqrtss((__v4sf)__A); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rcp_ss(__m128 __A) { +__funline __m128 _mm_rcp_ss(__m128 __A) { return (__m128)__builtin_ia32_rcpss((__v4sf)__A); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rsqrt_ss(__m128 __A) { +__funline __m128 _mm_rsqrt_ss(__m128 __A) { return (__m128)__builtin_ia32_rsqrtss((__v4sf)__A); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_min_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_min_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_minss((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_max_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_max_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_maxss((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_add_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_add_ps(__m128 __A, __m128 __B) { return (__m128)((__v4sf)__A + (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sub_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_sub_ps(__m128 __A, __m128 __B) { return (__m128)((__v4sf)__A - (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mul_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_mul_ps(__m128 __A, __m128 __B) { return (__m128)((__v4sf)__A * (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_div_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_div_ps(__m128 __A, __m128 __B) { return (__m128)((__v4sf)__A / (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sqrt_ps(__m128 __A) { +__funline __m128 _mm_sqrt_ps(__m128 __A) { return (__m128)__builtin_ia32_sqrtps((__v4sf)__A); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rcp_ps(__m128 __A) { +__funline __m128 _mm_rcp_ps(__m128 __A) { return (__m128)__builtin_ia32_rcpps((__v4sf)__A); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rsqrt_ps(__m128 __A) { +__funline __m128 _mm_rsqrt_ps(__m128 __A) { return (__m128)__builtin_ia32_rsqrtps((__v4sf)__A); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_min_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_min_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_minps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_max_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_max_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_maxps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_and_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_and_ps(__m128 __A, __m128 __B) { return __builtin_ia32_andps(__A, __B); } -extern 
__inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_andnot_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_andnot_ps(__m128 __A, __m128 __B) { return __builtin_ia32_andnps(__A, __B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_or_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_or_ps(__m128 __A, __m128 __B) { return __builtin_ia32_orps(__A, __B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_xor_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_xor_ps(__m128 __A, __m128 __B) { return __builtin_ia32_xorps(__A, __B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpeq_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpeq_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpeqss((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmplt_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmplt_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpltss((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmple_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmple_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpless((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpgt_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpgt_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_movss( (__v4sf)__A, (__v4sf)__builtin_ia32_cmpltss((__v4sf)__B, (__v4sf)__A)); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpge_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpge_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_movss( (__v4sf)__A, (__v4sf)__builtin_ia32_cmpless((__v4sf)__B, (__v4sf)__A)); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpneq_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpneq_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpneqss((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpnlt_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpnlt_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpnltss((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpnle_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpnle_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpnless((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpngt_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpngt_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_movss( (__v4sf)__A, (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__B, (__v4sf)__A)); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpnge_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpnge_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_movss( (__v4sf)__A, (__v4sf)__builtin_ia32_cmpnless((__v4sf)__B, (__v4sf)__A)); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - 
_mm_cmpord_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpord_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpordss((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpunord_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpunord_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpunordss((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpeq_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpeq_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpeqps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmplt_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmplt_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpltps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmple_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmple_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpleps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpgt_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpgt_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpgtps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpge_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpge_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpgeps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpneq_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpneq_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpneqps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpnlt_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpnlt_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpnltps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpnle_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpnle_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpnleps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpngt_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpngt_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpngtps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpnge_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpnge_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpngeps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpord_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpord_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpordps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmpunord_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_cmpunord_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_cmpunordps((__v4sf)__A, (__v4sf)__B); 
} -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comieq_ss(__m128 __A, __m128 __B) { +__funline int _mm_comieq_ss(__m128 __A, __m128 __B) { return __builtin_ia32_comieq((__v4sf)__A, (__v4sf)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comilt_ss(__m128 __A, __m128 __B) { +__funline int _mm_comilt_ss(__m128 __A, __m128 __B) { return __builtin_ia32_comilt((__v4sf)__A, (__v4sf)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comile_ss(__m128 __A, __m128 __B) { +__funline int _mm_comile_ss(__m128 __A, __m128 __B) { return __builtin_ia32_comile((__v4sf)__A, (__v4sf)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comigt_ss(__m128 __A, __m128 __B) { +__funline int _mm_comigt_ss(__m128 __A, __m128 __B) { return __builtin_ia32_comigt((__v4sf)__A, (__v4sf)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comige_ss(__m128 __A, __m128 __B) { +__funline int _mm_comige_ss(__m128 __A, __m128 __B) { return __builtin_ia32_comige((__v4sf)__A, (__v4sf)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comineq_ss(__m128 __A, __m128 __B) { +__funline int _mm_comineq_ss(__m128 __A, __m128 __B) { return __builtin_ia32_comineq((__v4sf)__A, (__v4sf)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_ucomieq_ss(__m128 __A, __m128 __B) { +__funline int _mm_ucomieq_ss(__m128 __A, __m128 __B) { return __builtin_ia32_ucomieq((__v4sf)__A, (__v4sf)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_ucomilt_ss(__m128 __A, __m128 __B) { +__funline int _mm_ucomilt_ss(__m128 __A, __m128 __B) { return __builtin_ia32_ucomilt((__v4sf)__A, (__v4sf)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_ucomile_ss(__m128 __A, __m128 __B) { +__funline int _mm_ucomile_ss(__m128 __A, __m128 __B) { return __builtin_ia32_ucomile((__v4sf)__A, (__v4sf)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_ucomigt_ss(__m128 __A, __m128 __B) { +__funline int _mm_ucomigt_ss(__m128 __A, __m128 __B) { return __builtin_ia32_ucomigt((__v4sf)__A, (__v4sf)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_ucomige_ss(__m128 __A, __m128 __B) { +__funline int _mm_ucomige_ss(__m128 __A, __m128 __B) { return __builtin_ia32_ucomige((__v4sf)__A, (__v4sf)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_ucomineq_ss(__m128 __A, __m128 __B) { +__funline int _mm_ucomineq_ss(__m128 __A, __m128 __B) { return __builtin_ia32_ucomineq((__v4sf)__A, (__v4sf)__B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtss_si32(__m128 __A) { +__funline int _mm_cvtss_si32(__m128 __A) { return __builtin_ia32_cvtss2si((__v4sf)__A); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_ss2si(__m128 __A) { +__funline int _mm_cvt_ss2si(__m128 __A) { return _mm_cvtss_si32(__A); } #ifdef __x86_64__ -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtss_si64(__m128 __A) { +__funline long long 
_mm_cvtss_si64(__m128 __A) { return __builtin_ia32_cvtss2si64((__v4sf)__A); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtss_si64x(__m128 __A) { +__funline long long _mm_cvtss_si64x(__m128 __A) { return __builtin_ia32_cvtss2si64((__v4sf)__A); } #endif -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtps_pi32(__m128 __A) { +__funline __m64 _mm_cvtps_pi32(__m128 __A) { return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__A); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_ps2pi(__m128 __A) { +__funline __m64 _mm_cvt_ps2pi(__m128 __A) { return _mm_cvtps_pi32(__A); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvttss_si32(__m128 __A) { +__funline int _mm_cvttss_si32(__m128 __A) { return __builtin_ia32_cvttss2si((__v4sf)__A); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtt_ss2si(__m128 __A) { +__funline int _mm_cvtt_ss2si(__m128 __A) { return _mm_cvttss_si32(__A); } #ifdef __x86_64__ -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvttss_si64(__m128 __A) { +__funline long long _mm_cvttss_si64(__m128 __A) { return __builtin_ia32_cvttss2si64((__v4sf)__A); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvttss_si64x(__m128 __A) { +__funline long long _mm_cvttss_si64x(__m128 __A) { return __builtin_ia32_cvttss2si64((__v4sf)__A); } #endif -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvttps_pi32(__m128 __A) { +__funline __m64 _mm_cvttps_pi32(__m128 __A) { return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__A); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtt_ps2pi(__m128 __A) { +__funline __m64 _mm_cvtt_ps2pi(__m128 __A) { return _mm_cvttps_pi32(__A); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsi32_ss(__m128 __A, int __B) { +__funline __m128 _mm_cvtsi32_ss(__m128 __A, int __B) { return (__m128)__builtin_ia32_cvtsi2ss((__v4sf)__A, __B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_si2ss(__m128 __A, int __B) { +__funline __m128 _mm_cvt_si2ss(__m128 __A, int __B) { return _mm_cvtsi32_ss(__A, __B); } #ifdef __x86_64__ -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsi64_ss(__m128 __A, long long __B) { +__funline __m128 _mm_cvtsi64_ss(__m128 __A, long long __B) { return (__m128)__builtin_ia32_cvtsi642ss((__v4sf)__A, __B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtsi64x_ss(__m128 __A, long long __B) { +__funline __m128 _mm_cvtsi64x_ss(__m128 __A, long long __B) { return (__m128)__builtin_ia32_cvtsi642ss((__v4sf)__A, __B); } #endif -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtpi32_ps(__m128 __A, __m64 __B) { +__funline __m128 _mm_cvtpi32_ps(__m128 __A, __m64 __B) { return (__m128)__builtin_ia32_cvtpi2ps((__v4sf)__A, (__v2si)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvt_pi2ps(__m128 __A, __m64 __B) { +__funline __m128 _mm_cvt_pi2ps(__m128 __A, __m64 __B) { return 
_mm_cvtpi32_ps(__A, __B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtpi16_ps(__m64 __A) { +__funline __m128 _mm_cvtpi16_ps(__m64 __A) { __v4hi __sign; __v2si __hisi, __losi; __v4sf __zero, __ra, __rb; @@ -562,9 +402,7 @@ extern __inline __m128 return (__m128)__builtin_ia32_movlhps(__ra, __rb); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtpu16_ps(__m64 __A) { +__funline __m128 _mm_cvtpu16_ps(__m64 __A) { __v2si __hisi, __losi; __v4sf __zero, __ra, __rb; __losi = (__v2si)__builtin_ia32_punpcklwd((__v4hi)__A, (__v4hi)0LL); @@ -575,9 +413,7 @@ extern __inline __m128 return (__m128)__builtin_ia32_movlhps(__ra, __rb); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtpi8_ps(__m64 __A) { +__funline __m128 _mm_cvtpi8_ps(__m64 __A) { __v8qi __sign; __sign = __builtin_ia32_pcmpgtb((__v8qi)0LL, (__v8qi)__A); @@ -587,25 +423,19 @@ extern __inline __m128 return _mm_cvtpi16_ps(__A); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtpu8_ps(__m64 __A) { +__funline __m128 _mm_cvtpu8_ps(__m64 __A) { __A = (__m64)__builtin_ia32_punpcklbw((__v8qi)__A, (__v8qi)0LL); return _mm_cvtpu16_ps(__A); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) { +__funline __m128 _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) { __v4sf __zero = (__v4sf)_mm_setzero_ps(); __v4sf __sfa = __builtin_ia32_cvtpi2ps(__zero, (__v2si)__A); __v4sf __sfb = __builtin_ia32_cvtpi2ps(__sfa, (__v2si)__B); return (__m128)__builtin_ia32_movlhps(__sfa, __sfb); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtps_pi16(__m128 __A) { +__funline __m64 _mm_cvtps_pi16(__m128 __A) { __v4sf __hisf = (__v4sf)__A; __v4sf __losf = __builtin_ia32_movhlps(__hisf, __hisf); __v2si __hisi = __builtin_ia32_cvtps2pi(__hisf); @@ -613,17 +443,13 @@ extern __inline __m64 return (__m64)__builtin_ia32_packssdw(__hisi, __losi); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtps_pi8(__m128 __A) { +__funline __m64 _mm_cvtps_pi8(__m128 __A) { __v4hi __tmp = (__v4hi)_mm_cvtps_pi16(__A); return (__m64)__builtin_ia32_packsswb(__tmp, (__v4hi)0LL); } #ifdef __OPTIMIZE__ -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) { +__funline __m128 _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) { return (__m128)__builtin_ia32_shufps((__v4sf)__A, (__v4sf)__B, __mask); } #else @@ -632,251 +458,172 @@ extern __inline __m128 (int)(MASK))) #endif -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_unpackhi_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_unpackhi_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_unpckhps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_unpacklo_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_unpacklo_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_unpcklps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_loadh_pi(__m128 __A, __m64 const *__P) { +__funline __m128 _mm_loadh_pi(__m128 __A, __m64 const *__P) { return 
(__m128)__builtin_ia32_loadhps((__v4sf)__A, (const __v2sf *)__P); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_storeh_pi(__m64 *__P, __m128 __A) { +__funline void _mm_storeh_pi(__m64 *__P, __m128 __A) { __builtin_ia32_storehps((__v2sf *)__P, (__v4sf)__A); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_movehl_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_movehl_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_movhlps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_movelh_ps(__m128 __A, __m128 __B) { +__funline __m128 _mm_movelh_ps(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_movlhps((__v4sf)__A, (__v4sf)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_loadl_pi(__m128 __A, __m64 const *__P) { +__funline __m128 _mm_loadl_pi(__m128 __A, __m64 const *__P) { return (__m128)__builtin_ia32_loadlps((__v4sf)__A, (const __v2sf *)__P); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_storel_pi(__m64 *__P, __m128 __A) { +__funline void _mm_storel_pi(__m64 *__P, __m128 __A) { __builtin_ia32_storelps((__v2sf *)__P, (__v4sf)__A); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_movemask_ps(__m128 __A) { +__funline int _mm_movemask_ps(__m128 __A) { return __builtin_ia32_movmskps((__v4sf)__A); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_getcsr(void) { +__funline unsigned int _mm_getcsr(void) { return __builtin_ia32_stmxcsr(); } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _MM_GET_EXCEPTION_STATE(void) { +__funline unsigned int _MM_GET_EXCEPTION_STATE(void) { return _mm_getcsr() & _MM_EXCEPT_MASK; } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _MM_GET_EXCEPTION_MASK(void) { +__funline unsigned int _MM_GET_EXCEPTION_MASK(void) { return _mm_getcsr() & _MM_MASK_MASK; } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _MM_GET_ROUNDING_MODE(void) { +__funline unsigned int _MM_GET_ROUNDING_MODE(void) { return _mm_getcsr() & _MM_ROUND_MASK; } -extern __inline unsigned int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _MM_GET_FLUSH_ZERO_MODE(void) { +__funline unsigned int _MM_GET_FLUSH_ZERO_MODE(void) { return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_setcsr(unsigned int __I) { +__funline void _mm_setcsr(unsigned int __I) { __builtin_ia32_ldmxcsr(__I); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _MM_SET_EXCEPTION_STATE(unsigned int __mask) { +__funline void _MM_SET_EXCEPTION_STATE(unsigned int __mask) { _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _MM_SET_EXCEPTION_MASK(unsigned int __mask) { +__funline void _MM_SET_EXCEPTION_MASK(unsigned int __mask) { _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _MM_SET_ROUNDING_MODE(unsigned int __mode) { +__funline void 
_MM_SET_ROUNDING_MODE(unsigned int __mode) { _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _MM_SET_FLUSH_ZERO_MODE(unsigned int __mode) { +__funline void _MM_SET_FLUSH_ZERO_MODE(unsigned int __mode) { _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_set_ss(float __F) { +__funline __m128 _mm_set_ss(float __F) { return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f}; } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_set1_ps(float __F) { +__funline __m128 _mm_set1_ps(float __F) { return __extension__(__m128)(__v4sf){__F, __F, __F, __F}; } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_set_ps1(float __F) { +__funline __m128 _mm_set_ps1(float __F) { return _mm_set1_ps(__F); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_load_ss(float const *__P) { +__funline __m128 _mm_load_ss(float const *__P) { return _mm_set_ss(*__P); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_load1_ps(float const *__P) { +__funline __m128 _mm_load1_ps(float const *__P) { return _mm_set1_ps(*__P); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_load_ps1(float const *__P) { +__funline __m128 _mm_load_ps1(float const *__P) { return _mm_load1_ps(__P); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_load_ps(float const *__P) { +__funline __m128 _mm_load_ps(float const *__P) { return *(__m128 *)__P; } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_loadu_ps(float const *__P) { +__funline __m128 _mm_loadu_ps(float const *__P) { return *(__m128_u *)__P; } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_loadr_ps(float const *__P) { +__funline __m128 _mm_loadr_ps(float const *__P) { __v4sf __tmp = *(__v4sf *)__P; return (__m128)__builtin_ia32_shufps(__tmp, __tmp, _MM_SHUFFLE(0, 1, 2, 3)); } -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, - __artificial__)) -_mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) { +__funline __m128 _mm_set_ps(const float __Z, const float __Y, const float __X, + const float __W) { return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z}; } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_setr_ps(float __Z, float __Y, float __X, float __W) { +__funline __m128 _mm_setr_ps(float __Z, float __Y, float __X, float __W) { return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W}; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_store_ss(float *__P, __m128 __A) { +__funline void _mm_store_ss(float *__P, __m128 __A) { *__P = ((__v4sf)__A)[0]; } -extern __inline float - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cvtss_f32(__m128 __A) { +__funline float _mm_cvtss_f32(__m128 __A) { return ((__v4sf)__A)[0]; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_store_ps(float *__P, __m128 __A) { +__funline void _mm_store_ps(float *__P, __m128 __A) { *(__m128 *)__P 
= __A; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_storeu_ps(float *__P, __m128 __A) { +__funline void _mm_storeu_ps(float *__P, __m128 __A) { *(__m128_u *)__P = __A; } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_store1_ps(float *__P, __m128 __A) { +__funline void _mm_store1_ps(float *__P, __m128 __A) { __v4sf __va = (__v4sf)__A; __v4sf __tmp = __builtin_ia32_shufps(__va, __va, _MM_SHUFFLE(0, 0, 0, 0)); _mm_storeu_ps(__P, __tmp); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_store_ps1(float *__P, __m128 __A) { +__funline void _mm_store_ps1(float *__P, __m128 __A) { _mm_store1_ps(__P, __A); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_storer_ps(float *__P, __m128 __A) { +__funline void _mm_storer_ps(float *__P, __m128 __A) { __v4sf __va = (__v4sf)__A; __v4sf __tmp = __builtin_ia32_shufps(__va, __va, _MM_SHUFFLE(0, 1, 2, 3)); _mm_store_ps(__P, __tmp); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_move_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_move_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_shuffle( (__v4sf)__A, (__v4sf)__B, __extension__(__attribute__((__vector_size__(16))) int){4, 1, 2, 3}); } #ifdef __OPTIMIZE__ -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_pi16(__m64 const __A, int const __N) { +__funline int _mm_extract_pi16(__m64 const __A, int const __N) { return __builtin_ia32_vec_ext_v4hi((__v4hi)__A, __N); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pextrw(__m64 const __A, int const __N) { +__funline int _m_pextrw(__m64 const __A, int const __N) { return _mm_extract_pi16(__A, __N); } #else @@ -887,15 +634,11 @@ extern __inline int #endif #ifdef __OPTIMIZE__ -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_pi16(__m64 const __A, int const __D, int const __N) { +__funline __m64 _mm_insert_pi16(__m64 const __A, int const __D, int const __N) { return (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)__A, __D, __N); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pinsrw(__m64 const __A, int const __D, int const __N) { +__funline __m64 _m_pinsrw(__m64 const __A, int const __D, int const __N) { return _mm_insert_pi16(__A, __D, __N); } #else @@ -905,88 +648,60 @@ extern __inline __m64 #define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N) #endif -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_max_pi16(__m64 __A, __m64 __B) { +__funline __m64 _mm_max_pi16(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pmaxsw((__v4hi)__A, (__v4hi)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pmaxsw(__m64 __A, __m64 __B) { +__funline __m64 _m_pmaxsw(__m64 __A, __m64 __B) { return _mm_max_pi16(__A, __B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_max_pu8(__m64 __A, __m64 __B) { +__funline __m64 _mm_max_pu8(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pmaxub((__v8qi)__A, (__v8qi)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pmaxub(__m64 __A, __m64 __B) { +__funline __m64 _m_pmaxub(__m64 
__A, __m64 __B) { return _mm_max_pu8(__A, __B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_min_pi16(__m64 __A, __m64 __B) { +__funline __m64 _mm_min_pi16(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pminsw((__v4hi)__A, (__v4hi)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pminsw(__m64 __A, __m64 __B) { +__funline __m64 _m_pminsw(__m64 __A, __m64 __B) { return _mm_min_pi16(__A, __B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_min_pu8(__m64 __A, __m64 __B) { +__funline __m64 _mm_min_pu8(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pminub((__v8qi)__A, (__v8qi)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pminub(__m64 __A, __m64 __B) { +__funline __m64 _m_pminub(__m64 __A, __m64 __B) { return _mm_min_pu8(__A, __B); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_movemask_pi8(__m64 __A) { +__funline int _mm_movemask_pi8(__m64 __A) { return __builtin_ia32_pmovmskb((__v8qi)__A); } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pmovmskb(__m64 __A) { +__funline int _m_pmovmskb(__m64 __A) { return _mm_movemask_pi8(__A); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_mulhi_pu16(__m64 __A, __m64 __B) { +__funline __m64 _mm_mulhi_pu16(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pmulhuw((__v4hi)__A, (__v4hi)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pmulhuw(__m64 __A, __m64 __B) { +__funline __m64 _m_pmulhuw(__m64 __A, __m64 __B) { return _mm_mulhi_pu16(__A, __B); } #ifdef __OPTIMIZE__ -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shuffle_pi16(__m64 __A, int const __N) { +__funline __m64 _mm_shuffle_pi16(__m64 __A, int const __N) { return (__m64)__builtin_ia32_pshufw((__v4hi)__A, __N); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pshufw(__m64 __A, int const __N) { +__funline __m64 _m_pshufw(__m64 __A, int const __N) { return _mm_shuffle_pi16(__A, __N); } #else @@ -996,69 +711,47 @@ extern __inline __m64 #define _m_pshufw(A, N) _mm_shuffle_pi16(A, N) #endif -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) { +__funline void _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) { __builtin_ia32_maskmovq((__v8qi)__A, (__v8qi)__N, __P); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_maskmovq(__m64 __A, __m64 __N, char *__P) { +__funline void _m_maskmovq(__m64 __A, __m64 __N, char *__P) { _mm_maskmove_si64(__A, __N, __P); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_avg_pu8(__m64 __A, __m64 __B) { +__funline __m64 _mm_avg_pu8(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pavgb((__v8qi)__A, (__v8qi)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pavgb(__m64 __A, __m64 __B) { +__funline __m64 _m_pavgb(__m64 __A, __m64 __B) { return _mm_avg_pu8(__A, __B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_avg_pu16(__m64 __A, __m64 __B) { 
+__funline __m64 _mm_avg_pu16(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_pavgw((__v4hi)__A, (__v4hi)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_pavgw(__m64 __A, __m64 __B) { +__funline __m64 _m_pavgw(__m64 __A, __m64 __B) { return _mm_avg_pu16(__A, __B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sad_pu8(__m64 __A, __m64 __B) { +__funline __m64 _mm_sad_pu8(__m64 __A, __m64 __B) { return (__m64)__builtin_ia32_psadbw((__v8qi)__A, (__v8qi)__B); } -extern __inline __m64 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _m_psadbw(__m64 __A, __m64 __B) { +__funline __m64 _m_psadbw(__m64 __A, __m64 __B) { return _mm_sad_pu8(__A, __B); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_stream_pi(__m64 *__P, __m64 __A) { +__funline void _mm_stream_pi(__m64 *__P, __m64 __A) { __builtin_ia32_movntq((unsigned long long *)__P, (unsigned long long)__A); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_stream_ps(float *__P, __m128 __A) { +__funline void _mm_stream_ps(float *__P, __m128 __A) { __builtin_ia32_movntps(__P, (__v4sf)__A); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sfence(void) { +__funline void _mm_sfence(void) { __builtin_ia32_sfence(); } @@ -1082,9 +775,7 @@ extern __inline void #pragma GCC pop_options #endif /* __DISABLE_SSE__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_pause(void) { +__funline void _mm_pause(void) { __builtin_ia32_pause(); } diff --git a/third_party/intel/xopintrin.internal.h b/third_party/intel/xopintrin.internal.h index 992f5a1ea..25feaabda 100644 --- a/third_party/intel/xopintrin.internal.h +++ b/third_party/intel/xopintrin.internal.h @@ -13,192 +13,134 @@ #define __DISABLE_XOP__ #endif /* __XOP__ */ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, 
(__v4si)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); } /* Packed Integer Horizontal Add and Subtract */ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_haddw_epi8(__m128i __A) { +__funline __m128i _mm_haddw_epi8(__m128i __A) { return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_haddd_epi8(__m128i __A) { +__funline __m128i _mm_haddd_epi8(__m128i __A) { return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_haddq_epi8(__m128i __A) { +__funline __m128i _mm_haddq_epi8(__m128i __A) { return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_haddd_epi16(__m128i __A) { +__funline __m128i _mm_haddd_epi16(__m128i __A) { return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_haddq_epi16(__m128i __A) { +__funline __m128i _mm_haddq_epi16(__m128i __A) { return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_haddq_epi32(__m128i __A) { 
+__funline __m128i _mm_haddq_epi32(__m128i __A) { return (__m128i)__builtin_ia32_vphadddq((__v4si)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_haddw_epu8(__m128i __A) { +__funline __m128i _mm_haddw_epu8(__m128i __A) { return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_haddd_epu8(__m128i __A) { +__funline __m128i _mm_haddd_epu8(__m128i __A) { return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_haddq_epu8(__m128i __A) { +__funline __m128i _mm_haddq_epu8(__m128i __A) { return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_haddd_epu16(__m128i __A) { +__funline __m128i _mm_haddd_epu16(__m128i __A) { return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_haddq_epu16(__m128i __A) { +__funline __m128i _mm_haddq_epu16(__m128i __A) { return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_haddq_epu32(__m128i __A) { +__funline __m128i _mm_haddq_epu32(__m128i __A) { return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hsubw_epi8(__m128i __A) { +__funline __m128i _mm_hsubw_epi8(__m128i __A) { return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hsubd_epi16(__m128i __A) { +__funline __m128i _mm_hsubd_epi16(__m128i __A) { return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_hsubq_epi32(__m128i __A) { +__funline __m128i _mm_hsubq_epi32(__m128i __A) { return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A); } /* Vector conditional move and permute */ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpcmov(__A, __B, __C); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C) { +__funline __m128i _mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C) { return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C); } @@ -206,52 +148,36 @@ extern __inline __m128i /* Packed Integer Rotates and Shifts Rotates - Non-Immediate form */ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rot_epi8(__m128i __A, __m128i __B) { +__funline __m128i _mm_rot_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rot_epi16(__m128i __A, __m128i __B) { +__funline __m128i _mm_rot_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) - _mm_rot_epi32(__m128i __A, __m128i __B) { +__funline __m128i _mm_rot_epi32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_rot_epi64(__m128i __A, __m128i __B) { +__funline __m128i _mm_rot_epi64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B); } #ifdef __OPTIMIZE__ -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_roti_epi8(__m128i __A, const int __B) { +__funline __m128i _mm_roti_epi8(__m128i __A, const int __B) { return (__m128i)__builtin_ia32_vprotbi((__v16qi)__A, __B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_roti_epi16(__m128i __A, const int __B) { +__funline __m128i _mm_roti_epi16(__m128i __A, const int __B) { return (__m128i)__builtin_ia32_vprotwi((__v8hi)__A, __B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_roti_epi32(__m128i __A, const int __B) { +__funline __m128i _mm_roti_epi32(__m128i __A, const int __B) { return (__m128i)__builtin_ia32_vprotdi((__v4si)__A, __B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_roti_epi64(__m128i __A, const int __B) { +__funline __m128i _mm_roti_epi64(__m128i __A, const int __B) { return (__m128i)__builtin_ia32_vprotqi((__v2di)__A, __B); } #else @@ -265,501 +191,341 @@ extern __inline __m128i ((__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (int)(N))) #endif -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shl_epi8(__m128i __A, __m128i __B) { +__funline __m128i _mm_shl_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shl_epi16(__m128i __A, __m128i __B) { +__funline __m128i _mm_shl_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shl_epi32(__m128i __A, __m128i __B) { +__funline __m128i _mm_shl_epi32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_shl_epi64(__m128i __A, __m128i __B) { +__funline __m128i _mm_shl_epi64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sha_epi8(__m128i __A, __m128i __B) { +__funline __m128i _mm_sha_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sha_epi16(__m128i __A, __m128i __B) { +__funline __m128i _mm_sha_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sha_epi32(__m128i __A, __m128i __B) { +__funline __m128i _mm_sha_epi32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B); } -extern 
__inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_sha_epi64(__m128i __A, __m128i __B) { +__funline __m128i _mm_sha_epi64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comlt_epu8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comlt_epu8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomltub((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comle_epu8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comle_epu8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomleub((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comgt_epu8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comgt_epu8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomgtub((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comge_epu8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comge_epu8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomgeub((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comeq_epu8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comeq_epu8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomequb((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comneq_epu8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comneq_epu8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomnequb((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comfalse_epu8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comfalse_epu8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomfalseub((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comtrue_epu8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comtrue_epu8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomtrueub((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comlt_epu16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comlt_epu16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomltuw((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comle_epu16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comle_epu16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomleuw((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comgt_epu16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comgt_epu16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomgtuw((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comge_epu16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comge_epu16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomgeuw((__v8hi)__A, (__v8hi)__B); } 
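/*
 * The vpcom* intrinsics above return a full per-lane mask: all ones in
 * a lane where the predicate holds, all zeros otherwise. Unlike SSE2,
 * the unsigned orderings are covered directly. Together with the
 * bitwise select _mm_cmov_si128(a, b, c) == (a & c) | (b & ~c), that
 * yields branch-free selection. A minimal sketch, assuming an
 * XOP-capable toolchain (e.g. gcc -mxop); the helper name is
 * hypothetical:
 *
 *   #include <x86intrin.h>
 *
 *   static __m128i umin_epu16(__m128i a, __m128i b) {
 *     __m128i lt = _mm_comlt_epu16(a, b);  // all-ones lanes where a < b
 *     return _mm_cmov_si128(a, b, lt);     // keep a where lt, else b
 *   }
 */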
-extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comeq_epu16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comeq_epu16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomequw((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comneq_epu16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comneq_epu16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomnequw((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comfalse_epu16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comfalse_epu16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomfalseuw((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comtrue_epu16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comtrue_epu16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomtrueuw((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comlt_epu32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comlt_epu32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomltud((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comle_epu32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comle_epu32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomleud((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comgt_epu32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comgt_epu32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomgtud((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comge_epu32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comge_epu32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomgeud((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comeq_epu32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comeq_epu32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomequd((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comneq_epu32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comneq_epu32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomnequd((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comfalse_epu32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comfalse_epu32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomfalseud((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comtrue_epu32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comtrue_epu32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomtrueud((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comlt_epu64(__m128i __A, __m128i __B) { +__funline __m128i _mm_comlt_epu64(__m128i __A, __m128i __B) { return 
(__m128i)__builtin_ia32_vpcomltuq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comle_epu64(__m128i __A, __m128i __B) { +__funline __m128i _mm_comle_epu64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomleuq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comgt_epu64(__m128i __A, __m128i __B) { +__funline __m128i _mm_comgt_epu64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomgtuq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comge_epu64(__m128i __A, __m128i __B) { +__funline __m128i _mm_comge_epu64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomgeuq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comeq_epu64(__m128i __A, __m128i __B) { +__funline __m128i _mm_comeq_epu64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomequq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comneq_epu64(__m128i __A, __m128i __B) { +__funline __m128i _mm_comneq_epu64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomnequq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comfalse_epu64(__m128i __A, __m128i __B) { +__funline __m128i _mm_comfalse_epu64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomfalseuq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comtrue_epu64(__m128i __A, __m128i __B) { +__funline __m128i _mm_comtrue_epu64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomtrueuq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comlt_epi8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comlt_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomltb((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comle_epi8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comle_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomleb((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comgt_epi8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comgt_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomgtb((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comge_epi8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comge_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomgeb((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comeq_epi8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comeq_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomeqb((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comneq_epi8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comneq_epi8(__m128i __A, __m128i __B) { 
return (__m128i)__builtin_ia32_vpcomneqb((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comfalse_epi8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comfalse_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomfalseb((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comtrue_epi8(__m128i __A, __m128i __B) { +__funline __m128i _mm_comtrue_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomtrueb((__v16qi)__A, (__v16qi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comlt_epi16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comlt_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomltw((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comle_epi16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comle_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomlew((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comgt_epi16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comgt_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomgtw((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comge_epi16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comge_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomgew((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comeq_epi16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comeq_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomeqw((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comneq_epi16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comneq_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomneqw((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comfalse_epi16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comfalse_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomfalsew((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comtrue_epi16(__m128i __A, __m128i __B) { +__funline __m128i _mm_comtrue_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomtruew((__v8hi)__A, (__v8hi)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comlt_epi32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comlt_epi32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomltd((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comle_epi32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comle_epi32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomled((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comgt_epi32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comgt_epi32(__m128i 
__A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomgtd((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comge_epi32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comge_epi32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomged((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comeq_epi32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comeq_epi32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomeqd((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comneq_epi32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comneq_epi32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomneqd((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comfalse_epi32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comfalse_epi32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomfalsed((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comtrue_epi32(__m128i __A, __m128i __B) { +__funline __m128i _mm_comtrue_epi32(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomtrued((__v4si)__A, (__v4si)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comlt_epi64(__m128i __A, __m128i __B) { +__funline __m128i _mm_comlt_epi64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomltq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comle_epi64(__m128i __A, __m128i __B) { +__funline __m128i _mm_comle_epi64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomleq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comgt_epi64(__m128i __A, __m128i __B) { +__funline __m128i _mm_comgt_epi64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomgtq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comge_epi64(__m128i __A, __m128i __B) { +__funline __m128i _mm_comge_epi64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomgeq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comeq_epi64(__m128i __A, __m128i __B) { +__funline __m128i _mm_comeq_epi64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomeqq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comneq_epi64(__m128i __A, __m128i __B) { +__funline __m128i _mm_comneq_epi64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomneqq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comfalse_epi64(__m128i __A, __m128i __B) { +__funline __m128i _mm_comfalse_epi64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomfalseq((__v2di)__A, (__v2di)__B); } -extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_comtrue_epi64(__m128i __A, __m128i __B) { +__funline __m128i 
_mm_comtrue_epi64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpcomtrueq((__v2di)__A, (__v2di)__B); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_frcz_ps(__m128 __A) { +__funline __m128 _mm_frcz_ps(__m128 __A) { return (__m128)__builtin_ia32_vfrczps((__v4sf)__A); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_frcz_pd(__m128d __A) { +__funline __m128d _mm_frcz_pd(__m128d __A) { return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_frcz_ss(__m128 __A, __m128 __B) { +__funline __m128 _mm_frcz_ss(__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_movss( (__v4sf)__A, (__v4sf)__builtin_ia32_vfrczss((__v4sf)__B)); } -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_frcz_sd(__m128d __A, __m128d __B) { +__funline __m128d _mm_frcz_sd(__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_movsd( (__v2df)__A, (__v2df)__builtin_ia32_vfrczsd((__v2df)__B)); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_frcz_ps(__m256 __A) { +__funline __m256 _mm256_frcz_ps(__m256 __A) { return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_frcz_pd(__m256d __A) { +__funline __m256d _mm256_frcz_pd(__m256d __A) { return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A); } #ifdef __OPTIMIZE__ -extern __inline __m128d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_permute2_pd(__m128d __X, __m128d __Y, __m128i __C, const int __I) { +__funline __m128d _mm_permute2_pd(__m128d __X, __m128d __Y, __m128i __C, + const int __I) { return (__m128d)__builtin_ia32_vpermil2pd((__v2df)__X, (__v2df)__Y, (__v2di)__C, __I); } -extern __inline __m256d - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permute2_pd(__m256d __X, __m256d __Y, __m256i __C, const int __I) { +__funline __m256d _mm256_permute2_pd(__m256d __X, __m256d __Y, __m256i __C, + const int __I) { return (__m256d)__builtin_ia32_vpermil2pd256((__v4df)__X, (__v4df)__Y, (__v4di)__C, __I); } -extern __inline __m128 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_permute2_ps(__m128 __X, __m128 __Y, __m128i __C, const int __I) { +__funline __m128 _mm_permute2_ps(__m128 __X, __m128 __Y, __m128i __C, + const int __I) { return (__m128)__builtin_ia32_vpermil2ps((__v4sf)__X, (__v4sf)__Y, (__v4si)__C, __I); } -extern __inline __m256 - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm256_permute2_ps(__m256 __X, __m256 __Y, __m256i __C, const int __I) { +__funline __m256 _mm256_permute2_ps(__m256 __X, __m256 __Y, __m256i __C, + const int __I) { return (__m256)__builtin_ia32_vpermil2ps256((__v8sf)__X, (__v8sf)__Y, (__v8si)__C, __I); } diff --git a/third_party/intel/xsavecintrin.internal.h b/third_party/intel/xsavecintrin.internal.h index 35ae37ac5..6daebde60 100644 --- a/third_party/intel/xsavecintrin.internal.h +++ b/third_party/intel/xsavecintrin.internal.h @@ -11,16 +11,12 @@ #define __DISABLE_XSAVEC__ #endif /* __XSAVEC__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xsavec(void *__P, long long __M) { +__funline void _xsavec(void *__P, long long __M) { __builtin_ia32_xsavec(__P, __M); } #ifdef
__x86_64__ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xsavec64(void *__P, long long __M) { +__funline void _xsavec64(void *__P, long long __M) { __builtin_ia32_xsavec64(__P, __M); } #endif diff --git a/third_party/intel/xsaveintrin.internal.h b/third_party/intel/xsaveintrin.internal.h index 4b7c4b5f9..76070f620 100644 --- a/third_party/intel/xsaveintrin.internal.h +++ b/third_party/intel/xsaveintrin.internal.h @@ -11,40 +11,28 @@ #define __DISABLE_XSAVE__ #endif /* __XSAVE__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xsave(void *__P, long long __M) { +__funline void _xsave(void *__P, long long __M) { __builtin_ia32_xsave(__P, __M); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xrstor(void *__P, long long __M) { +__funline void _xrstor(void *__P, long long __M) { __builtin_ia32_xrstor(__P, __M); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xsetbv(unsigned int __A, long long __V) { +__funline void _xsetbv(unsigned int __A, long long __V) { __builtin_ia32_xsetbv(__A, __V); } -extern __inline long long - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xgetbv(unsigned int __A) { +__funline long long _xgetbv(unsigned int __A) { return __builtin_ia32_xgetbv(__A); } #ifdef __x86_64__ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xsave64(void *__P, long long __M) { +__funline void _xsave64(void *__P, long long __M) { __builtin_ia32_xsave64(__P, __M); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xrstor64(void *__P, long long __M) { +__funline void _xrstor64(void *__P, long long __M) { __builtin_ia32_xrstor64(__P, __M); } #endif diff --git a/third_party/intel/xsaveoptintrin.internal.h b/third_party/intel/xsaveoptintrin.internal.h index 173779f92..45d39ddb4 100644 --- a/third_party/intel/xsaveoptintrin.internal.h +++ b/third_party/intel/xsaveoptintrin.internal.h @@ -11,16 +11,12 @@ #define __DISABLE_XSAVEOPT__ #endif /* __XSAVEOPT__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xsaveopt(void *__P, long long __M) { +__funline void _xsaveopt(void *__P, long long __M) { __builtin_ia32_xsaveopt(__P, __M); } #ifdef __x86_64__ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xsaveopt64(void *__P, long long __M) { +__funline void _xsaveopt64(void *__P, long long __M) { __builtin_ia32_xsaveopt64(__P, __M); } #endif diff --git a/third_party/intel/xsavesintrin.internal.h b/third_party/intel/xsavesintrin.internal.h index 765f64778..7a1b1e769 100644 --- a/third_party/intel/xsavesintrin.internal.h +++ b/third_party/intel/xsavesintrin.internal.h @@ -11,28 +11,20 @@ #define __DISABLE_XSAVES__ #endif /* __XSAVES__ */ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xsaves(void *__P, long long __M) { +__funline void _xsaves(void *__P, long long __M) { __builtin_ia32_xsaves(__P, __M); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xrstors(void *__P, long long __M) { +__funline void _xrstors(void *__P, long long __M) { __builtin_ia32_xrstors(__P, __M); } #ifdef __x86_64__ -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xrstors64(void *__P, long long __M) { +__funline 
void _xrstors64(void *__P, long long __M) { __builtin_ia32_xrstors64(__P, __M); } -extern __inline void - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xsaves64(void *__P, long long __M) { +__funline void _xsaves64(void *__P, long long __M) { __builtin_ia32_xsaves64(__P, __M); } #endif diff --git a/third_party/intel/xtestintrin.internal.h b/third_party/intel/xtestintrin.internal.h index f417dcad1..bc58a51d4 100644 --- a/third_party/intel/xtestintrin.internal.h +++ b/third_party/intel/xtestintrin.internal.h @@ -11,9 +11,7 @@ #define __DISABLE_RTM__ #endif /* __RTM__ */ -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _xtest(void) { +__funline int _xtest(void) { return __builtin_ia32_xtest(); } diff --git a/tool/emacs/cosmo-c-keywords.el b/tool/emacs/cosmo-c-keywords.el index 547520b3e..acd02e23b 100644 --- a/tool/emacs/cosmo-c-keywords.el +++ b/tool/emacs/cosmo-c-keywords.el @@ -26,6 +26,7 @@ (cosmo '("__msabi" + "__funline" "function" "offsetof" "microarchitecture"
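The xsave-family wrappers above remain one-line calls into the corresponding builtins, so caller code is unaffected by the rewrite. As an illustration of the _xgetbv wrapper, here is a minimal sketch that probes XCR0 for SSE and AVX state enablement; the helper name is hypothetical, and real code should first confirm CPUID.1:ECX.OSXSAVE before executing xgetbv:

#include <immintrin.h>

/* XCR0 bit 1 covers SSE (XMM) state and bit 2 covers AVX (YMM) state;
   both must be set before the OS will preserve YMM registers across
   context switches. Hypothetical helper, shown for illustration only. */
static int os_enabled_avx_state(void) {
  unsigned long long xcr0 = _xgetbv(0); /* register 0 selects XCR0 */
  return (xcr0 & 0x6) == 0x6;
}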