Release Cosmopolitan v3.8.0

This change switches C++ exception handling from SJLJ to standard DWARF
unwinding. It's needed because Clang for AArch64 doesn't support SJLJ.
It turns out that libunwind has a bare-metal configuration that made
this easy to do.

This change gets the new experimental cosmocc -mclang flag working well
enough that it can now build all of llamafile, with a 3x improvement in
build latency and no loss of runtime performance.

The int_fast16_t and int_fast32_t types are now always defined as
32-bit, in the interest of better ABI consistency between cosmocc -mgcc
and -mclang mode.
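As a quick illustration (not part of this change), a check like the
following should compile and pass under both cosmocc -mgcc and cosmocc
-mclang after this release:

/* check_fast_types.c -- illustrative only */
#include <stdint.h>
#include <stdio.h>

_Static_assert(sizeof(int_fast16_t) == 4, "int_fast16_t is expected to be 32-bit");
_Static_assert(sizeof(int_fast32_t) == 4, "int_fast32_t is expected to be 32-bit");

int main(void) {
  printf("int_fast16_t is %zu bytes\n", sizeof(int_fast16_t));
  printf("int_fast32_t is %zu bytes\n", sizeof(int_fast32_t));
  return 0;
}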
Justine Tunney 2024-08-30 20:12:26 -07:00
parent 5b9862907c
commit c9152b6f14
188 changed files with 199063 additions and 636 deletions

third_party/intel/clang/__wmmintrin_aes.h vendored
@@ -0,0 +1,140 @@
/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __WMMINTRIN_H
#error "Never use <__wmmintrin_aes.h> directly; include <wmmintrin.h> instead."
#endif
#ifndef __WMMINTRIN_AES_H
#define __WMMINTRIN_AES_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128)))
/// Performs a single round of AES encryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the encrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesenc_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
}
/// Performs the final round of AES encryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the encrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesenclast_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
}
/// Performs a single round of AES decryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the decrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesdec_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
}
/// Performs the final round of AES decryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the decrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesdeclast_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
}
/// Applies the AES InvMixColumns() transformation to an expanded key
/// contained in the source operand, and writes the result to the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the expanded key.
/// \returns A 128-bit integer vector containing the transformed value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesimc_si128(__m128i __V)
{
return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
}
/// Generates a round key for AES encryption, operating on 128-bit data
/// specified in the first source operand and using an 8-bit round constant
/// specified by the second source operand, and writes the result to the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
/// \endcode
///
/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
///
/// \param C
/// A 128-bit integer vector that is used to generate the AES encryption key.
/// \param R
/// An 8-bit round constant used to generate the AES encryption key.
/// \returns A 128-bit round key for AES encryption.
#define _mm_aeskeygenassist_si128(C, R) \
((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))
#undef __DEFAULT_FN_ATTRS
#endif /* __WMMINTRIN_AES_H */
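As a brief usage sketch (editorial, not part of the vendored header):
given an AES-128 key schedule already expanded into eleven round keys
(for example with _mm_aeskeygenassist_si128), one 16-byte block is
encrypted with the intrinsics above as follows; compile with -maes.

#include <wmmintrin.h>

/* Encrypts one 16-byte block with AES-128, given 11 expanded round keys. */
static void aes128_encrypt_block(unsigned char out[16],
                                 const unsigned char in[16],
                                 const __m128i rk[11]) {
  __m128i state = _mm_loadu_si128((const __m128i *)in);
  state = _mm_xor_si128(state, rk[0]);         /* initial key whitening */
  for (int i = 1; i < 10; ++i)
    state = _mm_aesenc_si128(state, rk[i]);    /* rounds 1 through 9 */
  state = _mm_aesenclast_si128(state, rk[10]); /* final round */
  _mm_storeu_si128((__m128i *)out, state);
}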

third_party/intel/clang/__wmmintrin_pclmul.h vendored
@@ -0,0 +1,48 @@
/*===---- __wmmintrin_pclmul.h - PCLMUL intrinsics --------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __WMMINTRIN_H
#error "Never use <__wmmintrin_pclmul.h> directly; include <wmmintrin.h> instead."
#endif
#ifndef __WMMINTRIN_PCLMUL_H
#define __WMMINTRIN_PCLMUL_H
/// Multiplies two 64-bit integer values, which are selected from source
/// operands using the immediate-value operand. The multiplication is a
/// carry-less multiplication, and the 128-bit integer product is stored in
/// the destination.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_clmulepi64_si128(__m128i X, __m128i Y, const int I);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
///
/// \param X
/// A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param Y
/// A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param I
/// An immediate value specifying which 64-bit values to select from the
/// operands. Bit 0 is used to select a value from operand \a X, and bit
/// 4 is used to select a value from operand \a Y: \n
/// Bit[0]=0 indicates that bits[63:0] of operand \a X are used. \n
/// Bit[0]=1 indicates that bits[127:64] of operand \a X are used. \n
/// Bit[4]=0 indicates that bits[63:0] of operand \a Y are used. \n
/// Bit[4]=1 indicates that bits[127:64] of operand \a Y are used.
/// \returns The 128-bit integer vector containing the result of the carry-less
/// multiplication of the selected 64-bit values.
#define _mm_clmulepi64_si128(X, Y, I) \
((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
(__v2di)(__m128i)(Y), (char)(I)))
#endif /* __WMMINTRIN_PCLMUL_H */
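As a brief usage sketch (editorial, not part of the vendored header):
carry-less multiplication of two 64-bit polynomials into a 128-bit
product, the building block of CRC and GHASH style computations; for
x86-64, compile with -mpclmul.

#include <stdint.h>
#include <wmmintrin.h>

/* Multiplies polynomials a and b over GF(2); the 128-bit product is
   returned as hi:lo. Immediate 0x00 selects bits [63:0] of both operands. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
  __m128i x = _mm_set_epi64x(0, (long long)a);
  __m128i y = _mm_set_epi64x(0, (long long)b);
  __m128i p = _mm_clmulepi64_si128(x, y, 0x00);
  *lo = (uint64_t)_mm_cvtsi128_si64(p);                    /* bits 63:0 */
  *hi = (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(p, 8)); /* bits 127:64 */
}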

third_party/intel/clang/adcintrin.h vendored
@@ -0,0 +1,160 @@
/*===---- adcintrin.h - ADC intrinsics -------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __ADCINTRIN_H
#define __ADCINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
/* Use C++ inline semantics in C++, GNU inline for C mode. */
#if defined(__cplusplus)
#define __INLINE __inline
#else
#define __INLINE static __inline
#endif
#if defined(__cplusplus)
extern "C" {
#endif
/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store32(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADC instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// A 32-bit unsigned addend.
/// \param __y
/// A 32-bit unsigned addend.
/// \param __p
/// Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
unsigned int __x,
unsigned int __y,
unsigned int *__p) {
return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
}
/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry
/// flag \a __cf, and subtracts the result from unsigned 32-bit integer
/// \a __x. Stores the unsigned 32-bit difference in the memory at \a __p,
/// and returns the 8-bit carry-out (carry or overflow flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store32(__p, __x - (__y + temp))
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c SBB instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// The 32-bit unsigned minuend.
/// \param __y
/// The 32-bit unsigned subtrahend.
/// \param __p
/// Pointer to memory for storing the difference.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
unsigned int __x,
unsigned int __y,
unsigned int *__p) {
return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
}
#ifdef __x86_64__
/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store64(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADC instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// A 64-bit unsigned addend.
/// \param __y
/// A 64-bit unsigned addend.
/// \param __p
/// Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS
_addcarry_u64(unsigned char __cf, unsigned long long __x,
unsigned long long __y, unsigned long long *__p) {
return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
}
/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry
/// flag \a __cf, and subtracts the result from unsigned 64-bit integer
/// \a __x. Stores the unsigned 64-bit difference in the memory at \a __p,
/// and returns the 8-bit carry-out (carry or overflow flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store64(__p, __x - (__y + temp))
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADC instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// The 64-bit unsigned minuend.
/// \param __y
/// The 64-bit unsigned subtrahend.
/// \param __p
/// Pointer to memory for storing the difference.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS
_subborrow_u64(unsigned char __cf, unsigned long long __x,
unsigned long long __y, unsigned long long *__p) {
return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
}
#endif
#if defined(__cplusplus)
}
#endif
#undef __INLINE
#undef __DEFAULT_FN_ATTRS
#endif /* __ADCINTRIN_H */
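As a brief usage sketch (editorial, not part of the vendored header):
adding two 128-bit integers held as pairs of 64-bit limbs on x86-64,
propagating the carry between limbs with _addcarry_u64.

#include <immintrin.h>
#include <stdint.h>

/* r = a + b, where each 128-bit value is a little-endian pair of limbs. */
static void add128(const uint64_t a[2], const uint64_t b[2], uint64_t r[2]) {
  unsigned long long lo, hi;
  unsigned char c = _addcarry_u64(0, a[0], b[0], &lo); /* low limb, carry out */
  (void)_addcarry_u64(c, a[1], b[1], &hi);             /* high limb, carry in */
  r[0] = lo;
  r[1] = hi;
}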

third_party/intel/clang/adxintrin.h vendored
@@ -0,0 +1,102 @@
/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __ADXINTRIN_H
#define __ADXINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("adx")))
/* Use C++ inline semantics in C++, GNU inline for C mode. */
#if defined(__cplusplus)
#define __INLINE __inline
#else
#define __INLINE static __inline
#endif
#if defined(__cplusplus)
extern "C" {
#endif
/* Intrinsics that are available only if __ADX__ is defined. */
/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store32(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADCX instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// A 32-bit unsigned addend.
/// \param __y
/// A 32-bit unsigned addend.
/// \param __p
/// Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarryx_u32(unsigned char __cf,
unsigned int __x,
unsigned int __y,
unsigned int *__p) {
return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
}
#ifdef __x86_64__
/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store64(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADCX instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// A 64-bit unsigned addend.
/// \param __y
/// A 64-bit unsigned addend.
/// \param __p
/// Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS
_addcarryx_u64(unsigned char __cf, unsigned long long __x,
unsigned long long __y, unsigned long long *__p) {
return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
}
#endif
#if defined(__cplusplus)
}
#endif
#undef __INLINE
#undef __DEFAULT_FN_ATTRS
#endif /* __ADXINTRIN_H */
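As a brief usage sketch (editorial, not part of the vendored header): a
multi-limb big-integer addition loop built on _addcarryx_u64, which maps
to the ADX instructions so the compiler may schedule independent carry
chains; for x86-64, compile with -madx.

#include <immintrin.h>
#include <stddef.h>

/* r = a + b over n little-endian 64-bit limbs; returns the final carry. */
static unsigned char addn(unsigned long long *r, const unsigned long long *a,
                          const unsigned long long *b, size_t n) {
  unsigned char c = 0;
  for (size_t i = 0; i < n; ++i)
    c = _addcarryx_u64(c, a[i], b[i], &r[i]);
  return c;
}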

third_party/intel/clang/ammintrin.h vendored
@@ -0,0 +1,183 @@
/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __AMMINTRIN_H
#define __AMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include "pmmintrin.h"
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128)))
/// Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param x
/// The value from which bits are extracted.
/// \param len
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
/// \param idx
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
/// [63:0] of parameter \a x are extracted. If the length is zero but the
/// index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
/// extracted from the source operand.
#define _mm_extracti_si64(x, len, idx) \
((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
(char)(len), (char)(idx)))
/// Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index and of the length specified by
/// \a __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param __x
/// The value from which bits are extracted.
/// \param __y
/// Specifies the index of the least significant bit at [13:8] and the
/// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
/// length is interpreted as 64. If the sum of the index and length is
/// greater than 64, the result is undefined. If the length and index are
/// both zero, bits [63:0] of parameter \a __x are extracted. If the length
/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
/// from the source operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_extract_si64(__m128i __x, __m128i __y)
{
return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
}
/// Inserts bits of a specified length from the source integer vector
/// \a y into the lower 64 bits of the destination integer vector \a x at
/// the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
/// const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param x
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length \a len and by the index \a idx specifying the
/// least significant bit.
/// \param y
/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand \a y of length \a len.
/// \param len
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
/// \param idx
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
/// [63:0] of parameter \a y are inserted into parameter \a x. If the length
/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
/// destination operand \a x with the specified bitfields replaced by the
/// lower bits of source operand \a y. The upper 64 bits of the return value
/// are undefined.
#define _mm_inserti_si64(x, y, len, idx) \
((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
(__v2di)(__m128i)(y), \
(char)(len), (char)(idx)))
/// Inserts bits of a specified length from the source integer vector
/// \a __y into the lower 64 bits of the destination integer vector \a __x
/// at the index and of the length specified by \a __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param __x
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length and by the index of the least significant bit
/// specified by operand \a __y.
/// \param __y
/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand \a __y with length
/// specified by bits [69:64]. These are inserted into the destination at the
/// index specified by bits [77:72]; all other bits are ignored. If bits
/// [69:64] are zero, the length is interpreted as 64. If the sum of the
/// index and length is greater than 64, the result is undefined. If the
/// length and index are both zero, bits [63:0] of parameter \a __y are
/// inserted into parameter \a __x. If the length is zero but the index is
/// non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
/// destination operand \a __x with the specified bitfields replaced by the
/// lower bits of source operand \a __y. The upper 64 bits of the return
/// value are undefined.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_si64(__m128i __x, __m128i __y)
{
return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
}
/// Stores a 64-bit double-precision value in a 64-bit memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
///
/// \param __p
/// The 64-bit memory location used to store the register value.
/// \param __a
/// The 64-bit double-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_sd(void *__p, __m128d __a)
{
__builtin_ia32_movntsd((double *)__p, (__v2df)__a);
}
/// Stores a 32-bit single-precision floating-point value in a 32-bit
/// memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
///
/// \param __p
/// The 32-bit memory location used to store the register value.
/// \param __a
/// The 32-bit single-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ss(void *__p, __m128 __a)
{
__builtin_ia32_movntss((float *)__p, (__v4sf)__a);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __AMMINTRIN_H */
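As a brief usage sketch (editorial, not part of the vendored header):
extracting an 8-bit field that starts at bit 16 of a 64-bit value using
the immediate form of EXTRQ documented above; compile with -msse4a. The
final mask is defensive, since only the low len bits of the result are
relied upon here.

#include <stdint.h>
#include <ammintrin.h>

static uint64_t extract_field(uint64_t v) {
  __m128i x = _mm_set_epi64x(0, (long long)v);
  /* len = 8 bits, idx = 16: pulls v[23:16] into the low bits of the result. */
  __m128i r = _mm_extracti_si64(x, 8, 16);
  return (uint64_t)_mm_cvtsi128_si64(r) & 0xff;
}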

third_party/intel/clang/amxcomplexintrin.h vendored
@@ -0,0 +1,169 @@
/*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===------------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H
#ifndef __AMX_COMPLEXINTRIN_H
#define __AMX_COMPLEXINTRIN_H
#ifdef __x86_64__
#define __DEFAULT_FN_ATTRS_COMPLEX \
__attribute__((__always_inline__, __nodebug__, __target__("amx-complex")))
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the imaginary part of the result. For each possible combination
/// of (row of \a a, column of \a b), it performs a set of multiplication
/// and accumulations on all corresponding complex numbers (one from \a a
/// and one from \a b). The imaginary part of the \a a element is multiplied
/// with the real part of the corresponding \a b element, and the real part
/// of the \a a element is multiplied with the imaginary part of the
/// corresponding \a b elements. The two accumulated results are added, and
/// then accumulated into the corresponding row and column of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
/// tmp := dst.row[m]
/// FOR k := 0 TO (a.colsb / 4) - 1
/// FOR n := 0 TO (dst.colsb / 4) - 1
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
/// ENDFOR
/// ENDFOR
/// write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param a
/// The 1st source tile. Max size is 1024 Bytes.
/// \param b
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_cmmimfp16ps(dst, a, b) __builtin_ia32_tcmmimfp16ps(dst, a, b)
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the real part of the result. For each possible combination
/// of (row of \a a, column of \a b), it performs a set of multiplication
/// and accumulations on all corresponding complex numbers (one from \a a
/// and one from \a b). The real part of the \a a element is multiplied
/// with the real part of the corresponding \a b element, and the negated
/// imaginary part of the \a a element is multiplied with the imaginary
/// part of the corresponding \a b elements. The two accumulated results
/// are added, and then accumulated into the corresponding row and column
/// of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
/// tmp := dst.row[m]
/// FOR k := 0 TO (a.colsb / 4) - 1
/// FOR n := 0 TO (dst.colsb / 4) - 1
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
/// ENDFOR
/// ENDFOR
/// write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMRLFP16PS instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param a
/// The 1st source tile. Max size is 1024 Bytes.
/// \param b
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_cmmrlfp16ps(dst, a, b) __builtin_ia32_tcmmrlfp16ps(dst, a, b)
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2);
}
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
}
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number with
/// FP16 real part and FP16 imaginary part.
/// This function calculates the imaginary part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCMMIMFP16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_COMPLEX
static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col,
dst->tile, src0.tile, src1.tile);
}
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number with
/// FP16 real part and FP16 imaginary part.
/// This function calculates the real part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCMMRLFP16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_COMPLEX
static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col,
dst->tile, src0.tile, src1.tile);
}
#endif // __x86_64__
#endif // __AMX_COMPLEXINTRIN_H

third_party/intel/clang/amxfp16intrin.h vendored
@@ -0,0 +1,58 @@
/*===------------- amxfp16intrin.h - AMX_FP16 intrinsics -*- C++ -*---------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===------------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <amxfp16intrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */
#ifndef __AMX_FP16INTRIN_H
#define __AMX_FP16INTRIN_H
#ifdef __x86_64__
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
/// and \a b, accumulating the intermediate single-precision (32-bit)
/// floating-point elements with elements in \a dst, and store the 32-bit
/// result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_dpfp16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
/// tmp := dst.row[m]
/// FOR k := 0 TO (a.colsb / 4) - 1
/// FOR n := 0 TO (dst.colsb / 4) - 1
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
/// FP32(b.row[k].fp16[2*n+0])
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
/// FP32(b.row[k].fp16[2*n+1])
/// ENDFOR
/// ENDFOR
/// write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPFP16PS instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param a
/// The 1st source tile. Max size is 1024 Bytes.
/// \param b
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpfp16ps(dst, a, b) \
__builtin_ia32_tdpfp16ps(dst, a, b)
#endif /* __x86_64__ */
#endif /* __AMX_FP16INTRIN_H */

third_party/intel/clang/amxintrin.h vendored
@@ -0,0 +1,524 @@
/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===------------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */
#ifndef __AMXINTRIN_H
#define __AMXINTRIN_H
#ifdef __x86_64__
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS_TILE \
__attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
#define __DEFAULT_FN_ATTRS_INT8 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
#define __DEFAULT_FN_ATTRS_FP16 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
/// Load tile configuration from a 64-byte memory location specified by
/// "mem_addr". The tile configuration includes the tile type palette, the
/// number of bytes per row, and the number of rows. If the specified
/// palette_id is zero, that signifies the init state for both the tile
/// config and the tile data, and the tiles are zeroed. Any invalid
/// configurations will result in #GP fault.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
///
/// \param __config
/// A pointer to 512-bits configuration
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_loadconfig(const void *__config) {
__builtin_ia32_tile_loadconfig(__config);
}
/// Stores the current tile configuration to a 64-byte memory location
/// specified by "mem_addr". The tile configuration includes the tile type
/// palette, the number of bytes per row, and the number of rows. If tiles
/// are not configured, all zeroes will be stored to memory.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
///
/// \param __config
/// A pointer to 512-bits configuration
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_storeconfig(void *__config) {
__builtin_ia32_tile_storeconfig(__config);
}
/// Release the tile configuration to return to the init state, which
/// releases all storage it currently holds.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
__builtin_ia32_tilerelease();
}
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
#define _tile_loadd(dst, base, stride) \
__builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
(__SIZE_TYPE__)(stride))
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
#define _tile_stream_loadd(dst, base, stride) \
__builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
(__SIZE_TYPE__)(stride))
/// Store the tile specified by "src" to memory specified by "base" address and
/// "stride" using the tile configuration previously configured via
/// "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be stored in memory.
#define _tile_stored(dst, base, stride) \
__builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
/// Zero the tile specified by "tdest".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param tile
/// The destination tile to be zero. Max size is 1024 Bytes.
#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1) \
__builtin_ia32_tdpbssd((dst), (src0), (src1))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1) \
__builtin_ia32_tdpbsud((dst), (src0), (src1))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1) \
__builtin_ia32_tdpbusd((dst), (src0), (src1))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1) \
__builtin_ia32_tdpbuud((dst), (src0), (src1))
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1) \
__builtin_ia32_tdpbf16ps((dst), (src0), (src1))
/// The AMX tile register size can be configured; the maximum size is
/// 16x64=1024 bytes. Since there is no 2D type in LLVM IR, we use a vector
/// type to represent the 2D tile, with the fixed size being the maximum AMX
/// tile register size.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
__SIZE_TYPE__ stride) {
return __builtin_ia32_tileloadd64_internal(m, n, base,
(__SIZE_TYPE__)(stride));
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
__SIZE_TYPE__ stride) {
return __builtin_ia32_tileloaddt164_internal(m, n, base,
(__SIZE_TYPE__)(stride));
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ void __DEFAULT_FN_ATTRS_INT8
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
__SIZE_TYPE__ stride, _tile1024i tile) {
return __builtin_ia32_tilestored64_internal(m, n, base,
(__SIZE_TYPE__)(stride), tile);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
}
/// This struct packs the shape and tile data together for the user. We
/// suggest initializing the struct as early as possible, because the compiler
/// depends on the shape information to do the configuration. Constant values
/// are preferred, so the compiler can optimize.
typedef struct __tile1024i_str {
const unsigned short row;
const unsigned short col;
_tile1024i tile;
} __tile1024i;
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Store the tile specified by "src" to memory specified by "base" address and
/// "stride".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be stored in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
__tile1024i src) {
_tile_stored_internal(src.row, src.col, base, stride, src.tile);
}
/// Zero the tile specified by "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param dst
/// The destination tile to be zero. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_zero(__tile1024i *dst) {
dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
}
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_BF16
static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP16
static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16
#undef __DEFAULT_FN_ATTRS_FP16
#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */
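As a brief usage sketch (editorial, not part of the vendored header):
one int8 tile dot-product, assuming AMX hardware and an OS that has
already granted tile-data permission to the process (on Linux this is
requested through arch_prctl); compile with -mamx-tile -mamx-int8. The
64-byte configuration follows the LDTILECFG layout described above.

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

/* 64-byte tile configuration: palette id, start row, then bytes-per-row
   and row count for each tile register. */
struct tile_config {
  uint8_t palette_id;
  uint8_t start_row;
  uint8_t reserved[14];
  uint16_t colsb[16];
  uint8_t rows[16];
};

/* C += A . B, where A and B are 16x64 int8 tiles and C is a 16x16 int32
   accumulator tile, using TDPBSSD as described in the pseudocode above. */
void dpbssd_16x16(int32_t C[16][16], const int8_t A[16][64],
                  const int8_t B[16][64]) {
  struct tile_config cfg;
  memset(&cfg, 0, sizeof(cfg));
  cfg.palette_id = 1;
  cfg.rows[0] = 16; cfg.colsb[0] = 64; /* tmm0: accumulator */
  cfg.rows[1] = 16; cfg.colsb[1] = 64; /* tmm1: operand A   */
  cfg.rows[2] = 16; cfg.colsb[2] = 64; /* tmm2: operand B   */
  _tile_loadconfig(&cfg);
  _tile_loadd(0, C, 64);
  _tile_loadd(1, A, 64);
  _tile_loadd(2, B, 64);
  _tile_dpbssd(0, 1, 2);
  _tile_stored(0, C, 64);
  _tile_release();
}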

third_party/intel/clang/avx2intrin.h vendored
(File diff suppressed because it is too large)

third_party/intel/clang/avx512bf16intrin.h vendored
@@ -0,0 +1,283 @@
/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
#endif
#ifdef __SSE2__
#ifndef __AVX512BF16INTRIN_H
#define __AVX512BF16INTRIN_H
typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
#define __DEFAULT_FN_ATTRS512 \
__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16,evex512"), \
__min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512bf16,no-evex512")))
/// Convert One BF16 Data to One Single Float Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic does not correspond to a specific instruction.
///
/// \param __A
/// A bfloat data.
/// \returns A float data whose sign field and exponent field keep unchanged,
/// and fraction field is extended to 23 bits.
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
return __builtin_ia32_cvtsbf162ss_32(__A);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __B
/// A 512-bit vector of [16 x float].
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
(__v16sf) __B);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __B
/// A 512-bit vector of [16 x float].
/// \param __W
/// A 512-bit vector of [32 x bfloat].
/// \param __U
/// A 32-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element from __W.
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
(__v32bf)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __B
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 32-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element is zero.
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
(__v32bf)_mm512_setzero_si512());
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_cvtneps_pbh(__m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
(__v16bf)_mm256_undefined_si256(),
(__mmask16)-1);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __W
/// A 256-bit vector of [16 x bfloat].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
(__v16bf)__W,
(__mmask16)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 256-bit vector of [16 x bfloat] containing the conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
(__v16bf)_mm256_setzero_si256(),
(__mmask16)__U);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 512-bit vector of [32 x bfloat].
/// \param __B
/// A 512-bit vector of [32 x bfloat].
/// \param __D
/// A 512-bit vector of [16 x float].
/// \returns A 512-bit vector of [16 x float] containing the dot product of
/// __A and __B, accumulated with __D.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
(__v32bf) __A,
(__v32bf) __B);
}
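/* Illustrative sketch, not part of the upstream header: an fp32-accumulated
   bf16 dot product over n floats. Each _mm512_dpbf16_ps step multiplies
   adjacent bf16 pairs and adds the pair sums into the fp32 accumulator.
   Hypothetical helper; assumes n is a multiple of 32 and that avx512bf16 and
   avx512f are enabled. */
static inline float example_dot_bf16(const float *x, const float *y, int n) {
  __m512 acc = _mm512_setzero_ps();
  for (int i = 0; i < n; i += 32) {
    __m512bh xb = _mm512_cvtne2ps_pbh(_mm512_loadu_ps(x + i + 16),
                                      _mm512_loadu_ps(x + i));
    __m512bh yb = _mm512_cvtne2ps_pbh(_mm512_loadu_ps(y + i + 16),
                                      _mm512_loadu_ps(y + i));
    acc = _mm512_dpbf16_ps(acc, xb, yb);
  }
  return _mm512_reduce_add_ps(acc); /* horizontal sum of the 16 partial sums */
}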
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 512-bit vector of [32 x bfloat].
/// \param __B
/// A 512-bit vector of [32 x bfloat].
/// \param __D
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
/// \returns A 512-bit vector of [16 x float] containing the dot product of
/// __A and __B, accumulated with __D.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
(__v16sf)__D);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 512-bit vector of [32 x bfloat].
/// \param __B
/// A 512-bit vector of [32 x bfloat].
/// \param __D
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
/// \returns A 512-bit vector of [16 x float] containing the dot product of
/// __A and __B, accumulated with __D.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
(__v16sf)_mm512_setzero_si512());
}
/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] containing the conversion of __A.
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
}
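/* Illustrative note, not from the upstream header: bf16 is simply the upper
   16 bits of an IEEE-754 binary32 value, which is why the widening conversion
   above is implemented as a zero-extend followed by a 16-bit left shift.
   A scalar sketch of the same idea, with a hypothetical helper name: */
static inline float example_bf16_bits_to_float(unsigned short bits) {
  union { unsigned int u; float f; } v;
  v.u = (unsigned int)bits << 16; /* sign, exponent, 7 fraction bits; low 16 bits zero */
  return v.f;
}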
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
/// A 16-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] containing the conversion of __A.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
(__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
/// A 512-bit vector of [16 x float]. Elements are copied from __S when
/// the corresponding mask bit is not set.
/// \param __U
/// A 16-bit mask.
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] containing the conversion of __A.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
(__m512i)__S, (__mmask16)__U,
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS512
#endif
#endif


@ -0,0 +1,86 @@
/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512BITALGINTRIN_H
#define __AVX512BITALGINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512bitalg,evex512"), \
__min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi16(__m512i __A)
{
return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
{
return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
(__v32hi) _mm512_popcnt_epi16(__B),
(__v32hi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
{
return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(),
__U,
__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi8(__m512i __A)
{
return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
{
return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
(__v64qi) _mm512_popcnt_epi8(__B),
(__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
{
return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(),
__U,
__B);
}
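/* Illustrative sketch, not part of the upstream header: counting all set bits
   in a 64-byte block by combining the per-byte popcount above with a
   horizontal sum. Hypothetical helper; assumes avx512bitalg, avx512bw, and
   avx512f are enabled. */
static inline long long example_popcount_64_bytes(const void *p) {
  __m512i v = _mm512_loadu_si512(p);
  __m512i per_byte = _mm512_popcnt_epi8(v);                             /* 64 byte counts */
  __m512i per_qword = _mm512_sad_epu8(per_byte, _mm512_setzero_si512()); /* 8 partial sums */
  return _mm512_reduce_add_epi64(per_qword);
}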
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
{
return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
(__v64qi) __B,
__U);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
{
return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
__A,
__B);
}
#undef __DEFAULT_FN_ATTRS
#endif

2014
third_party/intel/clang/avx512bwintrin.h vendored Normal file

File diff suppressed because it is too large

125
third_party/intel/clang/avx512cdintrin.h vendored Normal file

@ -0,0 +1,125 @@
/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512CDINTRIN_H
#define __AVX512CDINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512cd,evex512"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi64 (__m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_conflict_epi64(__A),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_conflict_epi64(__A),
(__v8di)_mm512_setzero_si512 ());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi32 (__m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_conflict_epi32(__A),
(__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_conflict_epi32(__A),
(__v16si)_mm512_setzero_si512());
}
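/* Illustrative sketch, not part of the upstream header: each lane of the
   conflict result carries one bit per lower-numbered lane holding the same
   value, so an all-zero result means the elements are pairwise distinct.
   That is useful for deciding whether a scatter with these indices can be
   done in one shot. Hypothetical helper; assumes avx512cd and avx512f. */
static inline int example_indices_are_unique(__m512i idx) {
  __m512i conf = _mm512_conflict_epi32(idx);
  return _mm512_test_epi32_mask(conf, conf) == 0;
}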
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi32 (__m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_lzcnt_epi32(__A),
(__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_lzcnt_epi32(__A),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi64 (__m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_lzcnt_epi64(__A),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_lzcnt_epi64(__A),
(__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmb_epi64 (__mmask8 __A)
{
return (__m512i) _mm512_set1_epi64((long long) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmw_epi32 (__mmask16 __A)
{
return (__m512i) _mm512_set1_epi32((int) __A);
}
#undef __DEFAULT_FN_ATTRS
#endif

1379
third_party/intel/clang/avx512dqintrin.h vendored Normal file

File diff suppressed because it is too large

271
third_party/intel/clang/avx512erintrin.h vendored Normal file

@ -0,0 +1,271 @@
/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512ERINTRIN_H
#define __AVX512ERINTRIN_H
/* exp2a23 */
#define _mm512_exp2a23_round_pd(A, R) \
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)))
#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)))
#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R)))
#define _mm512_exp2a23_pd(A) \
_mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_exp2a23_pd(S, M, A) \
_mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_exp2a23_pd(M, A) \
_mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_exp2a23_round_ps(A, R) \
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)))
#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)))
#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R)))
#define _mm512_exp2a23_ps(A) \
_mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_exp2a23_ps(S, M, A) \
_mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_exp2a23_ps(M, A) \
_mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
/* rsqrt28 */
#define _mm512_rsqrt28_round_pd(A, R) \
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)))
#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)))
#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R)))
#define _mm512_rsqrt28_pd(A) \
_mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rsqrt28_pd(S, M, A) \
_mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rsqrt28_pd(M, A) \
_mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_rsqrt28_round_ps(A, R) \
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)))
#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)))
#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R)))
#define _mm512_rsqrt28_ps(A) \
_mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rsqrt28_ps(S, M, A) \
_mm512_mask_rsqrt28_round_ps((S), (M), A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rsqrt28_ps(M, A) \
_mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_ss(A, B, R) \
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R)))
#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R)))
#define _mm_rsqrt28_ss(A, B) \
_mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_mask_rsqrt28_ss(S, M, A, B) \
_mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_rsqrt28_ss(M, A, B) \
_mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_sd(A, B, R) \
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R)))
#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R)))
#define _mm_rsqrt28_sd(A, B) \
_mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_mask_rsqrt28_sd(S, M, A, B) \
_mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_rsqrt28_sd(M, A, B) \
_mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
/* rcp28 */
#define _mm512_rcp28_round_pd(A, R) \
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)))
#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)))
#define _mm512_maskz_rcp28_round_pd(M, A, R) \
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R)))
#define _mm512_rcp28_pd(A) \
_mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rcp28_pd(S, M, A) \
_mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rcp28_pd(M, A) \
_mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_rcp28_round_ps(A, R) \
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)))
#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)))
#define _mm512_maskz_rcp28_round_ps(M, A, R) \
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R)))
#define _mm512_rcp28_ps(A) \
_mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rcp28_ps(S, M, A) \
_mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rcp28_ps(M, A) \
_mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_ss(A, B, R) \
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R)))
#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R)))
#define _mm_rcp28_ss(A, B) \
_mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_mask_rcp28_ss(S, M, A, B) \
_mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_rcp28_ss(M, A, B) \
_mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_sd(A, B, R) \
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R)))
#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R)))
#define _mm_rcp28_sd(A, B) \
_mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_mask_rcp28_sd(S, M, A, B) \
_mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_rcp28_sd(M, A, B) \
_mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
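/* Illustrative sketch, not part of the upstream header: the ER forms return
   reciprocal and reciprocal-square-root approximations with a relative error
   of about 2^-28, often accurate enough to skip a Newton refinement step.
   Hypothetical helper scaling 16 floats by 1/sqrt(s); assumes avx512er
   hardware (Knights Landing / Knights Mill class). */
static inline __m512 example_scale_by_rsqrt(__m512 x, __m512 s) {
  __m512 r = _mm512_rsqrt28_ps(s); /* ~1/sqrt(s), about 2^-28 relative error */
  return _mm512_mul_ps(x, r);
}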
#endif /* __AVX512ERINTRIN_H */

9779
third_party/intel/clang/avx512fintrin.h vendored Normal file

File diff suppressed because it is too large

3352
third_party/intel/clang/avx512fp16intrin.h vendored Normal file

File diff suppressed because it is too large


@ -0,0 +1,70 @@
/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __IFMAINTRIN_H
#define __IFMAINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512ifma,evex512"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y,
(__v8di) __Z);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52hi_epu64(__W, __X, __Y),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
(__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y,
(__v8di) __Z);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52lo_epu64(__W, __X, __Y),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
(__v8di)_mm512_setzero_si512());
}
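/* Illustrative sketch, not part of the upstream header: the IFMA forms
   multiply 52-bit unsigned values kept in 64-bit lanes and add either the low
   or the high 52 bits of the 104-bit product to an accumulator, which maps
   directly onto multi-precision arithmetic with radix-2^52 limbs.
   Hypothetical helper; assumes avx512ifma is enabled and that every lane of
   a and b is already below 2^52. */
static inline void example_limb_mul_add(__m512i a, __m512i b,
                                        __m512i *acc_lo, __m512i *acc_hi) {
  *acc_lo = _mm512_madd52lo_epu64(*acc_lo, a, b); /* += low 52 bits of a*b  */
  *acc_hi = _mm512_madd52hi_epu64(*acc_hi, a, b); /* += high 52 bits of a*b */
}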
#undef __DEFAULT_FN_ATTRS
#endif


@ -0,0 +1,111 @@
/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __IFMAVLINTRIN_H
#define __IFMAVLINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512ifma,avx512vl,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512ifma,avx512vl,no-evex512"), \
__min_vector_width__(256)))
#define _mm_madd52hi_epu64(X, Y, Z) \
((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y), \
(__v2di)(Z)))
#define _mm256_madd52hi_epu64(X, Y, Z) \
((__m256i)__builtin_ia32_vpmadd52huq256((__v4di)(X), (__v4di)(Y), \
(__v4di)(Z)))
#define _mm_madd52lo_epu64(X, Y, Z) \
((__m128i)__builtin_ia32_vpmadd52luq128((__v2di)(X), (__v2di)(Y), \
(__v2di)(Z)))
#define _mm256_madd52lo_epu64(X, Y, Z) \
((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y), \
(__v4di)(Z)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52hi_epu64(__W, __X, __Y),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52hi_epu64(__X, __Y, __Z),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52hi_epu64(__W, __X, __Y),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z),
(__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52lo_epu64(__W, __X, __Y),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52lo_epu64(__X, __Y, __Z),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52lo_epu64(__W, __X, __Y),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z),
(__v4di)_mm256_setzero_si256());
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif


@ -0,0 +1,92 @@
/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512PFINTRIN_H
#define __AVX512PFINTRIN_H
#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
(void const *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
__builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
(void const *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfdps((__mmask16)(mask), \
(__v16si)(__m512i)(index), (void const *)(addr), \
(int)(scale), (int)(hint))
#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
__builtin_ia32_gatherpfdps((__mmask16) -1, \
(__v16si)(__m512i)(index), (void const *)(addr), \
(int)(scale), (int)(hint))
#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
(void const *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
__builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
(void const *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
(void const *)(addr), (int)(scale), (int)(hint))
#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
__builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
(void const *)(addr), (int)(scale), (int)(hint))
#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
__builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
(void *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
(void *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
__builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
(void *)(addr), (int)(scale), (int)(hint))
#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfdps((__mmask16)(mask), \
(__v16si)(__m512i)(index), (void *)(addr), \
(int)(scale), (int)(hint))
#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
__builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
(void *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
(void *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
__builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
(void *)(addr), (int)(scale), (int)(hint))
#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
(void *)(addr), (int)(scale), (int)(hint))
#endif


@ -0,0 +1,357 @@
/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VBMI2INTRIN_H
#define __AVX512VBMI2INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2,evex512"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
(__v32hi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
(__v32hi) _mm512_setzero_si512(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
(__v64qi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
(__v64qi) _mm512_setzero_si512(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D)
{
__builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D)
{
__builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
(__v32hi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
(__v32hi) _mm512_setzero_si512(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
(__v64qi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
(__v64qi) _mm512_setzero_si512(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
(__v32hi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
(__v32hi) _mm512_setzero_si512(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
(__v64qi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
(__v64qi) _mm512_setzero_si512(),
__U);
}
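/* Illustrative sketch, not part of the upstream header: combining a byte
   compare mask with the compress-store form above to left-pack the selected
   bytes into a buffer, here keeping only the strictly positive ones.
   Hypothetical helper; assumes avx512vbmi2 and avx512bw are enabled.
   Returns the number of bytes written. */
static inline int example_keep_positive_bytes(void *dst, __m512i src) {
  __mmask64 keep = _mm512_cmpgt_epi8_mask(src, _mm512_setzero_si512());
  _mm512_mask_compressstoreu_epi8(dst, keep, src);
  return __builtin_popcountll(keep);
}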
#define _mm512_shldi_epi64(A, B, I) \
((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I)))
#define _mm512_mask_shldi_epi64(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
(__v8di)(__m512i)(S)))
#define _mm512_maskz_shldi_epi64(U, A, B, I) \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
(__v8di)_mm512_setzero_si512()))
#define _mm512_shldi_epi32(A, B, I) \
((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I)))
#define _mm512_mask_shldi_epi32(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
(__v16si)(__m512i)(S)))
#define _mm512_maskz_shldi_epi32(U, A, B, I) \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
(__v16si)_mm512_setzero_si512()))
#define _mm512_shldi_epi16(A, B, I) \
((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I)))
#define _mm512_mask_shldi_epi16(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
(__v32hi)(__m512i)(S)))
#define _mm512_maskz_shldi_epi16(U, A, B, I) \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
(__v32hi)_mm512_setzero_si512()))
#define _mm512_shrdi_epi64(A, B, I) \
((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I)))
#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
(__v8di)(__m512i)(S)))
#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
(__v8di)_mm512_setzero_si512()))
#define _mm512_shrdi_epi32(A, B, I) \
((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I)))
#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
(__v16si)(__m512i)(S)))
#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
(__v16si)_mm512_setzero_si512()))
#define _mm512_shrdi_epi16(A, B, I) \
((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I)))
#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
(__v32hi)(__m512i)(S)))
#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
(__v32hi)_mm512_setzero_si512()))
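/* Illustrative sketch, not part of the upstream header: the double-shift
   forms concatenate each lane of the first operand (upper half) with the
   matching lane of the second (lower half), shift the pair, and keep the
   upper part, so passing the same vector twice turns _mm512_shldi_epi64 into
   a per-lane rotate. Hypothetical helper; assumes avx512vbmi2 is enabled. */
static inline __m512i example_rotl13_epi64(__m512i v) {
  return _mm512_shldi_epi64(v, v, 13); /* rotate each 64-bit lane left by 13 */
}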
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshldvq512((__v8di)__A, (__v8di)__B,
(__v8di)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectq_512(__U,
(__v8di)_mm512_shldv_epi64(__A, __B, __C),
(__v8di)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectq_512(__U,
(__v8di)_mm512_shldv_epi64(__A, __B, __C),
(__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshldvd512((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_shldv_epi32(__A, __B, __C),
(__v16si)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_shldv_epi32(__A, __B, __C),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshldvw512((__v32hi)__A, (__v32hi)__B,
(__v32hi)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectw_512(__U,
(__v32hi)_mm512_shldv_epi16(__A, __B, __C),
(__v32hi)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectw_512(__U,
(__v32hi)_mm512_shldv_epi16(__A, __B, __C),
(__v32hi)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshrdvq512((__v8di)__A, (__v8di)__B,
(__v8di)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectq_512(__U,
(__v8di)_mm512_shrdv_epi64(__A, __B, __C),
(__v8di)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectq_512(__U,
(__v8di)_mm512_shrdv_epi64(__A, __B, __C),
(__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshrdvd512((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
{
return (__m512i) __builtin_ia32_selectd_512(__U,
(__v16si)_mm512_shrdv_epi32(__A, __B, __C),
(__v16si)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i) __builtin_ia32_selectd_512(__U,
(__v16si)_mm512_shrdv_epi32(__A, __B, __C),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshrdvw512((__v32hi)__A, (__v32hi)__B,
(__v32hi)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectw_512(__U,
(__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
(__v32hi)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectw_512(__U,
(__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
(__v32hi)_mm512_setzero_si512());
}
#undef __DEFAULT_FN_ATTRS
#endif


@ -0,0 +1,106 @@
/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __VBMIINTRIN_H
#define __VBMIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vbmi,evex512"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
{
return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
(__v64qi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
__m512i __B)
{
return (__m512i)__builtin_ia32_selectb_512(__U,
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
(__v64qi)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
__m512i __B)
{
return (__m512i)__builtin_ia32_selectb_512(__U,
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
(__v64qi)__I);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
__m512i __B)
{
return (__m512i)__builtin_ia32_selectb_512(__U,
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
(__v64qi)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
__m512i __B)
{
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_permutexvar_epi8(__A, __B),
(__v64qi)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
__m512i __B)
{
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_permutexvar_epi8(__A, __B),
(__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y)
{
return (__m512i)__builtin_ia32_vpmultishiftqb512((__v64qi)__X, (__v64qi) __Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X,
__m512i __Y)
{
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
(__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y)
{
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
(__v64qi)_mm512_setzero_si512());
}
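/* Illustrative sketch, not part of the upstream header: unlike PSHUFB, which
   can only pick bytes within 16-byte lanes, the VBMI byte permute selects
   from anywhere in the 64-byte source, so a full 64-entry byte table lookup
   is a single instruction. Only the low 6 bits of each index byte are used.
   Hypothetical helper; assumes avx512vbmi is enabled. */
static inline __m512i example_table_lookup_64(__m512i table, __m512i indices) {
  return _mm512_permutexvar_epi8(indices, table);
}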
#undef __DEFAULT_FN_ATTRS
#endif


@ -0,0 +1,193 @@
/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __VBMIVLINTRIN_H
#define __VBMIVLINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vbmi,avx512vl,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vbmi,avx512vl,no-evex512"), \
__min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
{
return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A,
(__v16qi)__I,
(__v16qi)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
__m128i __B)
{
return (__m128i)__builtin_ia32_selectb_128(__U,
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
(__v16qi)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
__m128i __B)
{
return (__m128i)__builtin_ia32_selectb_128(__U,
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
(__v16qi)__I);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
__m128i __B)
{
return (__m128i)__builtin_ia32_selectb_128(__U,
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
(__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
{
return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
(__v32qi)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
__m256i __B)
{
return (__m256i)__builtin_ia32_selectb_256(__U,
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
(__v32qi)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
__m256i __B)
{
return (__m256i)__builtin_ia32_selectb_256(__U,
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
(__v32qi)__I);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
__m256i __B)
{
return (__m256i)__builtin_ia32_selectb_256(__U,
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
(__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
(__v16qi)_mm_permutexvar_epi8(__A, __B),
(__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
__m128i __B)
{
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
(__v16qi)_mm_permutexvar_epi8(__A, __B),
(__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
__m256i __B)
{
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
(__v32qi)_mm256_permutexvar_epi8(__A, __B),
(__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
__m256i __B)
{
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
(__v32qi)_mm256_permutexvar_epi8(__A, __B),
(__v32qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_vpmultishiftqb128((__v16qi)__X, (__v16qi)__Y);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X,
__m128i __Y)
{
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
(__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
(__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
(__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
(__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_vpmultishiftqb256((__v32qi)__X, (__v32qi)__Y);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X,
__m256i __Y)
{
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
(__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
(__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
(__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
(__v32qi)_mm256_setzero_si256());
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif


@ -0,0 +1,517 @@
/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
#endif
#ifdef __SSE2__
#ifndef __AVX512VLBF16INTRIN_H
#define __AVX512VLBF16INTRIN_H
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512bf16,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512bf16,no-evex512"), \
__min_vector_width__(256)))
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __B
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
(__v4sf) __B);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __B
/// A 128-bit vector of [4 x float].
/// \param __W
/// A 128-bit vector of [8 x bfloat].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element from __W.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
(__v8bf)_mm_cvtne2ps_pbh(__A, __B),
(__v8bf)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __B
/// A 128-bit vector of [4 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element is zero.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
(__v8bf)_mm_cvtne2ps_pbh(__A, __B),
(__v8bf)_mm_setzero_si128());
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __B
/// A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
(__v8sf) __B);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __B
/// A 256-bit vector of [8 x float].
/// \param __W
/// A 256-bit vector of [16 x bfloat].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element from __W.
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
(__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
(__v16bf)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __B
/// A 256-bit vector of [8 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element is zero.
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
(__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
(__v16bf)_mm256_setzero_si256());
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __A, and higher 64 bits are 0.
#define _mm_cvtneps_pbh(A) \
((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __W
/// A 128-bit vector of [8 x bfloat].
/// \param __U
/// A 4-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __A, and higher 64 bits are 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
(__v8bf)__W,
(__mmask8)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __U
/// A 4-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __A, and higher 64 bits are 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
(__v8bf)_mm_setzero_si128(),
(__mmask8)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \returns A 128-bit vector of [8 x bfloat] containing the conversion of __A.
#define _mm256_cvtneps_pbh(A) \
((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __W
/// A 128-bit vector of [8 x bfloat].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 128-bit vector of [8 x bfloat] that comes from the conversion
/// of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
(__v8bf)__W,
(__mmask8)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 128-bit vector of [8 x bfloat] that comes from the conversion
/// of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
(__v8bf)_mm_setzero_si128(),
(__mmask8)__U);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \param __B
/// A 128-bit vector of [8 x bfloat].
/// \param __D
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] that comes from the dot product
/// of __A, __B, and __D.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
(__v8bf)__A,
(__v8bf)__B);
}
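/* Illustrative usage sketch (hypothetical helper): a bf16 dot product
   accumulated in float, one _mm_dpbf16_ps per 8 bf16 pairs. Assumes __n is a
   multiple of 8, 16-byte aligned inputs, and that AVX512BF16 + AVX512VL are
   enabled (e.g. -mavx512bf16 -mavx512vl). */
static inline float example_bf16_dot(const __m128bh *__a, const __m128bh *__b,
                                     int __n) {
  __m128 __acc = _mm_setzero_ps();
  float __lanes[4];
  for (int __i = 0; __i < __n / 8; ++__i)
    __acc = _mm_dpbf16_ps(__acc, __a[__i], __b[__i]);  /* 4 partial float sums */
  _mm_storeu_ps(__lanes, __acc);
  return __lanes[0] + __lanes[1] + __lanes[2] + __lanes[3];
}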
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \param __B
/// A 128-bit vector of [8 x bfloat].
/// \param __D
/// A 128-bit vector of [4 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
/// \returns A 128-bit vector of [4 x float] that comes from the dot product
/// of __A, __B, and __D.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
(__v4sf)__D);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \param __B
/// A 128-bit vector of [8 x bfloat].
/// \param __D
/// A 128-bit vector of [4 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
/// \returns A 128-bit vector of [4 x float] that comes from the dot product
/// of __A, __B, and __D.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
(__v4sf)_mm_setzero_si128());
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \param __B
/// A 256-bit vector of [16 x bfloat].
/// \param __D
/// A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] that comes from the dot product
/// of __A, __B, and __D.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
(__v16bf)__A,
(__v16bf)__B);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \param __B
/// A 256-bit vector of [16 x bfloat].
/// \param __D
/// A 256-bit vector of [8 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
/// \returns A 256-bit vector of [8 x float] that comes from the dot product
/// of __A, __B, and __D.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
(__v8sf)__D);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \param __B
/// A 256-bit vector of [16 x bfloat].
/// \param __D
/// A 256-bit vector of [8 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
/// \returns A 256-bit vector of [8 x float] that comes from the dot product
/// of __A, __B, and __D.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
(__v8sf)_mm256_setzero_si256());
}
/// Convert One Single float Data to One BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A float value.
/// \returns A bf16 value whose sign and exponent fields are unchanged and
/// whose fraction field is truncated to 7 bits.
static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
__v4sf __V = {__A, 0, 0, 0};
__v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
(__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
return (__bf16)__R[0];
}
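/* Illustrative usage sketch (hypothetical helpers): storing floats as bf16 to
   halve memory, and widening back. A bf16 value is simply the upper 16 bits
   of a binary32 float, so the reverse conversion is a 16-bit shift. Assumes
   AVX512BF16 + AVX512VL are enabled (e.g. -mavx512bf16 -mavx512vl). */
static inline __bf16 example_float_to_bf16(float __x) {
  return _mm_cvtness_sbh(__x);
}
static inline float example_bf16_to_float(__bf16 __x) {
  unsigned short __bits;
  unsigned int __wide;
  float __f;
  __builtin_memcpy(&__bits, &__x, sizeof(__bits));
  __wide = (unsigned int)__bits << 16;  /* bf16 is the float's high half */
  __builtin_memcpy(&__f, &__wide, sizeof(__f));
  return __f;
}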
/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] that comes from the conversion
/// of __A.
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
return _mm_castsi128_ps(
(__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] that comes from the conversion
/// of __A.
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
(__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
}
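/* Illustrative usage sketch (hypothetical helper): a bf16 round trip over
   8 floats. Only the top 16 bits of each float survive, which makes the
   precision loss of bf16 storage easy to observe. Assumes AVX512BF16 +
   AVX512VL are enabled. */
static inline __m256 example_bf16_roundtrip(__m256 __v) {
  __m128bh __h = _mm256_cvtneps_pbh(__v);  /* 8 floats -> 8 bf16 */
  return _mm256_cvtpbh_ps(__h);            /* widen back to 8 floats */
}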
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
/// A 4-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] that comes from the conversion
/// of __A.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
(__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
/// An 8-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] that comes from the conversion
/// of __A.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
(__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
/// A 128-bit vector of [4 x float]. Elements are copied from __S when
/// the corresponding mask bit is not set.
/// \param __U
/// A 4-bit mask. Elements are taken from __S when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] that comes from the conversion
/// of __A.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
(__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
16));
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
/// A 256-bit vector of [8 x float]. Elements are copied from __S when
/// the corresponding mask bit is not set.
/// \param __U
/// An 8-bit mask. Elements are taken from __S when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] that comes from the conversion
/// of __A.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
(__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
16));
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif
#endif

View file

@ -0,0 +1,151 @@
/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLBITALGINTRIN_H
#define __AVX512VLBITALGINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512bitalg,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512bitalg,no-evex512"), \
__min_vector_width__(256)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi16(__m256i __A)
{
return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
{
return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
(__v16hi) _mm256_popcnt_epi16(__B),
(__v16hi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
{
return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
__U,
__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi16(__m128i __A)
{
return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
{
return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
(__v8hi) _mm_popcnt_epi16(__B),
(__v8hi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
{
return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
__U,
__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi8(__m256i __A)
{
return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
{
return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
(__v32qi) _mm256_popcnt_epi8(__B),
(__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
{
return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
__U,
__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi8(__m128i __A)
{
return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
{
return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
(__v16qi) _mm_popcnt_epi8(__B),
(__v16qi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
{
return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
__U,
__B);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B)
{
return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
(__v32qi) __B,
__U);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B)
{
return _mm256_mask_bitshuffle_epi64_mask((__mmask32) -1,
__A,
__B);
}
static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B)
{
return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
(__v16qi) __B,
__U);
}
static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B)
{
return _mm_mask_bitshuffle_epi64_mask((__mmask16) -1,
__A,
__B);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif
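/* Illustrative usage sketch (hypothetical helper): counting the set bits in a
   32-byte block with _mm256_popcnt_epi8, then reducing the per-byte counts
   with a byte-wise sum of absolute differences. Assumes AVX512BITALG,
   AVX512VL, and AVX2 are enabled. */
static inline long long example_popcount_32_bytes(__m256i __v) {
  __m256i __bytes = _mm256_popcnt_epi8(__v);                          /* per-byte counts */
  __m256i __sums = _mm256_sad_epu8(__bytes, _mm256_setzero_si256());  /* 4 x 64-bit sums */
  return _mm256_extract_epi64(__sums, 0) + _mm256_extract_epi64(__sums, 1) +
         _mm256_extract_epi64(__sums, 2) + _mm256_extract_epi64(__sums, 3);
}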

3167
third_party/intel/clang/avx512vlbwintrin.h vendored Normal file

File diff suppressed because it is too large.

View file

@ -0,0 +1,230 @@
/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLCDINTRIN_H
#define __AVX512VLCDINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512cd,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512cd,no-evex512"), \
__min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastmb_epi64 (__mmask8 __A)
{
return (__m128i) _mm_set1_epi64x((long long) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastmb_epi64 (__mmask8 __A)
{
return (__m256i) _mm256_set1_epi64x((long long)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastmw_epi32 (__mmask16 __A)
{
return (__m128i) _mm_set1_epi32((int)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastmw_epi32 (__mmask16 __A)
{
return (__m256i) _mm256_set1_epi32((int)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_conflict_epi64 (__m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictdi_128 ((__v2di) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_conflict_epi64(__A),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_conflict_epi64(__A),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_conflict_epi64 (__m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictdi_256 ((__v4di) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_conflict_epi64(__A),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_conflict_epi64(__A),
(__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_conflict_epi32 (__m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictsi_128 ((__v4si) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_conflict_epi32(__A),
(__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_conflict_epi32(__A),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_conflict_epi32 (__m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictsi_256 ((__v8si) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_conflict_epi32(__A),
(__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_conflict_epi32(__A),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_lzcnt_epi32 (__m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntd_128 ((__v4si) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_lzcnt_epi32(__A),
(__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_lzcnt_epi32(__A),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_lzcnt_epi32 (__m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntd_256 ((__v8si) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_lzcnt_epi32(__A),
(__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_lzcnt_epi32(__A),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_lzcnt_epi64 (__m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntq_128 ((__v2di) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_lzcnt_epi64(__A),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_lzcnt_epi64(__A),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_lzcnt_epi64 (__m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntq_256 ((__v4di) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_lzcnt_epi64(__A),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_lzcnt_epi64(__A),
(__v4di)_mm256_setzero_si256());
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif /* __AVX512VLCDINTRIN_H */
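/* Illustrative usage sketch (hypothetical helper): _mm256_conflict_epi32
   reports, per lane, a bit for every earlier lane holding the same value, so
   any nonzero lane means a duplicate key, e.g. before a masked scatter.
   Assumes AVX512CD + AVX512VL are enabled. */
static inline int example_has_duplicate_keys(__m256i __keys) {
  __m256i __conflicts = _mm256_conflict_epi32(__keys);
  return !_mm256_testz_si256(__conflicts, __conflicts);  /* 1 if any duplicate */
}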

1173
third_party/intel/clang/avx512vldqintrin.h vendored Normal file

File diff suppressed because it is too large.

File diff suppressed because it is too large.

8437
third_party/intel/clang/avx512vlintrin.h vendored Normal file

File diff suppressed because it is too large.

View file

@ -0,0 +1,695 @@
/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLVBMI2INTRIN_H
#define __AVX512VLVBMI2INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512vbmi2,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512vbmi2,no-evex512"), \
__min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
(__v8hi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
(__v16qi) _mm_setzero_si128(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
{
__builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
{
__builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
(__v8hi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
(__v16qi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
(__v8hi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
(__v16qi) _mm_setzero_si128(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
(__v16hi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
(__v16hi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
(__v32qi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
(__v32qi) _mm256_setzero_si256(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D)
{
__builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D)
{
__builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
(__v16hi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
(__v16hi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
(__v32qi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
(__v32qi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
(__v16hi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
(__v16hi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
(__v32qi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
(__v32qi) _mm256_setzero_si256(),
__U);
}
#define _mm256_shldi_epi64(A, B, I) \
((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I)))
#define _mm256_mask_shldi_epi64(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shldi_epi64((A), (B), (I)), \
(__v4di)(__m256i)(S)))
#define _mm256_maskz_shldi_epi64(U, A, B, I) \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shldi_epi64((A), (B), (I)), \
(__v4di)_mm256_setzero_si256()))
#define _mm_shldi_epi64(A, B, I) \
((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I)))
#define _mm_mask_shldi_epi64(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shldi_epi64((A), (B), (I)), \
(__v2di)(__m128i)(S)))
#define _mm_maskz_shldi_epi64(U, A, B, I) \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shldi_epi64((A), (B), (I)), \
(__v2di)_mm_setzero_si128()))
#define _mm256_shldi_epi32(A, B, I) \
((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I)))
#define _mm256_mask_shldi_epi32(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shldi_epi32((A), (B), (I)), \
(__v8si)(__m256i)(S)))
#define _mm256_maskz_shldi_epi32(U, A, B, I) \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shldi_epi32((A), (B), (I)), \
(__v8si)_mm256_setzero_si256()))
#define _mm_shldi_epi32(A, B, I) \
((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I)))
#define _mm_mask_shldi_epi32(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shldi_epi32((A), (B), (I)), \
(__v4si)(__m128i)(S)))
#define _mm_maskz_shldi_epi32(U, A, B, I) \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shldi_epi32((A), (B), (I)), \
(__v4si)_mm_setzero_si128()))
#define _mm256_shldi_epi16(A, B, I) \
((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I)))
#define _mm256_mask_shldi_epi16(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
(__v16hi)(__m256i)(S)))
#define _mm256_maskz_shldi_epi16(U, A, B, I) \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
(__v16hi)_mm256_setzero_si256()))
#define _mm_shldi_epi16(A, B, I) \
((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I)))
#define _mm_mask_shldi_epi16(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shldi_epi16((A), (B), (I)), \
(__v8hi)(__m128i)(S)))
#define _mm_maskz_shldi_epi16(U, A, B, I) \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shldi_epi16((A), (B), (I)), \
(__v8hi)_mm_setzero_si128()))
#define _mm256_shrdi_epi64(A, B, I) \
((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I)))
#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
(__v4di)(__m256i)(S)))
#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
(__v4di)_mm256_setzero_si256()))
#define _mm_shrdi_epi64(A, B, I) \
((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I)))
#define _mm_mask_shrdi_epi64(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shrdi_epi64((A), (B), (I)), \
(__v2di)(__m128i)(S)))
#define _mm_maskz_shrdi_epi64(U, A, B, I) \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shrdi_epi64((A), (B), (I)), \
(__v2di)_mm_setzero_si128()))
#define _mm256_shrdi_epi32(A, B, I) \
((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I)))
#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
(__v8si)(__m256i)(S)))
#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
(__v8si)_mm256_setzero_si256()))
#define _mm_shrdi_epi32(A, B, I) \
((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I)))
#define _mm_mask_shrdi_epi32(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shrdi_epi32((A), (B), (I)), \
(__v4si)(__m128i)(S)))
#define _mm_maskz_shrdi_epi32(U, A, B, I) \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shrdi_epi32((A), (B), (I)), \
(__v4si)_mm_setzero_si128()))
#define _mm256_shrdi_epi16(A, B, I) \
((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I)))
#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
(__v16hi)(__m256i)(S)))
#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
(__v16hi)_mm256_setzero_si256()))
#define _mm_shrdi_epi16(A, B, I) \
((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I)))
#define _mm_mask_shrdi_epi16(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
(__v8hi)(__m128i)(S)))
#define _mm_maskz_shrdi_epi16(U, A, B, I) \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
(__v8hi)_mm_setzero_si128()))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshldvq256((__v4di)__A, (__v4di)__B,
(__v4di)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shldv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectq_256(__U,
(__v4di)_mm256_shldv_epi64(__A, __B, __C),
(__v4di)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectq_256(__U,
(__v4di)_mm256_shldv_epi64(__A, __B, __C),
(__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshldvq128((__v2di)__A, (__v2di)__B,
(__v2di)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shldv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectq_128(__U,
(__v2di)_mm_shldv_epi64(__A, __B, __C),
(__v2di)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectq_128(__U,
(__v2di)_mm_shldv_epi64(__A, __B, __C),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshldvd256((__v8si)__A, (__v8si)__B,
(__v8si)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shldv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_shldv_epi32(__A, __B, __C),
(__v8si)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_shldv_epi32(__A, __B, __C),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshldvd128((__v4si)__A, (__v4si)__B,
(__v4si)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shldv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_shldv_epi32(__A, __B, __C),
(__v4si)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_shldv_epi32(__A, __B, __C),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshldvw256((__v16hi)__A, (__v16hi)__B,
(__v16hi)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shldv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectw_256(__U,
(__v16hi)_mm256_shldv_epi16(__A, __B, __C),
(__v16hi)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectw_256(__U,
(__v16hi)_mm256_shldv_epi16(__A, __B, __C),
(__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshldvw128((__v8hi)__A, (__v8hi)__B,
(__v8hi)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shldv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectw_128(__U,
(__v8hi)_mm_shldv_epi16(__A, __B, __C),
(__v8hi)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectw_128(__U,
(__v8hi)_mm_shldv_epi16(__A, __B, __C),
(__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshrdvq256((__v4di)__A, (__v4di)__B,
(__v4di)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectq_256(__U,
(__v4di)_mm256_shrdv_epi64(__A, __B, __C),
(__v4di)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectq_256(__U,
(__v4di)_mm256_shrdv_epi64(__A, __B, __C),
(__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshrdvq128((__v2di)__A, (__v2di)__B,
(__v2di)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shrdv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectq_128(__U,
(__v2di)_mm_shrdv_epi64(__A, __B, __C),
(__v2di)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectq_128(__U,
(__v2di)_mm_shrdv_epi64(__A, __B, __C),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshrdvd256((__v8si)__A, (__v8si)__B,
(__v8si)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_shrdv_epi32(__A, __B, __C),
(__v8si)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_shrdv_epi32(__A, __B, __C),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshrdvd128((__v4si)__A, (__v4si)__B,
(__v4si)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shrdv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_shrdv_epi32(__A, __B, __C),
(__v4si)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_shrdv_epi32(__A, __B, __C),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshrdvw256((__v16hi)__A, (__v16hi)__B,
(__v16hi)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectw_256(__U,
(__v16hi)_mm256_shrdv_epi16(__A, __B, __C),
(__v16hi)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectw_256(__U,
(__v16hi)_mm256_shrdv_epi16(__A, __B, __C),
(__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshrdvw128((__v8hi)__A, (__v8hi)__B,
(__v8hi)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shrdv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectw_128(__U,
(__v8hi)_mm_shrdv_epi16(__A, __B, __C),
(__v8hi)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectw_128(__U,
(__v8hi)_mm_shrdv_epi16(__A, __B, __C),
(__v8hi)_mm_setzero_si128());
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif
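/* Illustrative usage sketch (hypothetical helper): left-packing the bytes
   selected by a mask and streaming them to memory, e.g. to drop filtered-out
   records; returns how many bytes were written. Assumes AVX512VBMI2 +
   AVX512VL are enabled. */
static inline int example_pack_selected_bytes(void *__dst, __mmask16 __keep,
                                              __m128i __v) {
  _mm_mask_compressstoreu_epi8(__dst, __keep, __v);  /* contiguous selected bytes */
  return __builtin_popcount((unsigned)__keep);       /* count of stored bytes */
}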

View file

@ -0,0 +1,310 @@
/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLVNNIINTRIN_H
#define __AVX512VLVNNIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512vnni,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512vnni,no-evex512"), \
__min_vector_width__(256)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusd_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusds_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssd_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssds_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusd_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusds_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssd_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssds_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
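/* Illustrative usage sketch (hypothetical helper): one step of a quantized
   (unsigned 8-bit x signed 8-bit) dot-product loop. Each 32-bit accumulator
   lane gains the sum of four adjacent byte products, as in the pseudocode
   above. Assumes AVX512VNNI + AVX512VL are enabled. */
static inline __m256i example_u8s8_dot_step(__m256i __acc, __m256i __u8,
                                            __m256i __s8) {
  return _mm256_dpbusd_epi32(__acc, __u8, __s8);
}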
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusd_epi32(__S, __A, __B),
(__v4si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusd_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusds_epi32(__S, __A, __B),
(__v4si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusds_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssd_epi32(__S, __A, __B),
(__v4si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssd_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssds_epi32(__S, __A, __B),
(__v4si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssds_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif

View file

@ -0,0 +1,123 @@
/*===------ avx512vlvp2intersectintrin.h - VL VP2INTERSECT intrinsics ------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvp2intersectintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512VLVP2INTERSECT_H
#define _AVX512VLVP2INTERSECT_H
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512vp2intersect,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512vp2intersect,no-evex512"), \
__min_vector_width__(256)))
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between dwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x i32].
/// \param __b
/// A 256-bit vector of [8 x i32].
/// \param __m0
/// A pointer to an 8-bit mask.
/// \param __m1
/// A pointer to an 8-bit mask.
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_2intersect_epi32(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_d_256((__v8si)__a, (__v8si)__b, __m0, __m1);
}
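/* Illustrative usage sketch (hypothetical helper): computing which lanes of
   __a also occur somewhere in __b; the companion mask for __b is discarded.
   Assumes AVX512VP2INTERSECT + AVX512VL are enabled. */
static inline __mmask8 example_lanes_of_a_in_b(__m256i __a, __m256i __b) {
  __mmask8 __in_a, __in_b;
  _mm256_2intersect_epi32(__a, __b, &__in_a, &__in_b);
  return __in_a;
}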
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between quadwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x i64].
/// \param __b
/// A 256-bit vector of [4 x i64].
/// \param __m0
/// A pointer to an 8-bit mask.
/// \param __m1
/// A pointer to an 8-bit mask.
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_2intersect_epi64(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_q_256((__v4di)__a, (__v4di)__b, __m0, __m1);
}
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between dwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x i32].
/// \param __b
/// A 128-bit vector of [4 x i32].
/// \param __m0
/// A pointer to an 8-bit mask.
/// \param __m1
/// A pointer to an 8-bit mask.
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_2intersect_epi32(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_d_128((__v4si)__a, (__v4si)__b, __m0, __m1);
}
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between quadwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x i64].
/// \param __b
/// A 128-bit vector of [2 x i64].
/// \param __m0
/// A pointer to an 8-bit mask that receives the match indicators for \a __a.
/// \param __m1
/// A pointer to an 8-bit mask that receives the match indicators for \a __b.
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_2intersect_epi64(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_q_128((__v2di)__a, (__v2di)__b, __m0, __m1);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif
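A minimal usage sketch for the VL forms above (an editorial illustration, not part of the vendored header). It assumes a toolchain and CPU with AVX512VL and AVX512VP2INTERSECT enabled, for example by compiling with -mavx512vl -mavx512vp2intersect; the input values are made up:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m256i a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
  __m256i b = _mm256_setr_epi32(8, 7, 6, 5, 0, 0, 0, 0);
  __mmask8 ka, kb;
  // ka flags lanes of a that appear anywhere in b; kb does the same for b.
  _mm256_2intersect_epi32(a, b, &ka, &kb);
  printf("ka=%#x kb=%#x\n", (unsigned)ka, (unsigned)kb);  // expect ka=0xf0 kb=0xf
  return 0;
}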


@ -0,0 +1,116 @@
/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VNNIINTRIN_H
#define __AVX512VNNIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vnni,evex512"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
#undef __DEFAULT_FN_ATTRS
#endif
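To make the dot-product accumulators above concrete, here is a small editorial sketch (not part of the header) of one VPDPBUSD step; it assumes an AVX-512 VNNI capable CPU and a build with -mavx512vnni, and the constants are illustrative:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512i acc = _mm512_setzero_si512();
  __m512i a = _mm512_set1_epi8(2);   // interpreted as unsigned bytes
  __m512i b = _mm512_set1_epi8(-3);  // interpreted as signed bytes
  // Each 32-bit lane accumulates four byte products: 4 * (2 * -3) = -24.
  acc = _mm512_dpbusd_epi32(acc, a, b);
  int out[16];
  _mm512_storeu_si512(out, acc);
  printf("%d\n", out[0]);  // -24
  return 0;
}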


@ -0,0 +1,78 @@
/*===------- avx512vpintersectintrin.h - VP2INTERSECT intrinsics ------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vp2intersect.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512VP2INTERSECT_H
#define _AVX512VP2INTERSECT_H
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vp2intersect,evex512"), \
__min_vector_width__(512)))
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between dwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
///
/// \param __a
/// A 512-bit vector of [16 x i32].
/// \param __b
/// A 512-bit vector of [16 x i32].
/// \param __m0
/// A pointer to a 16-bit mask that receives the match indicators for \a __a.
/// \param __m1
/// A pointer to a 16-bit mask that receives the match indicators for \a __b.
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_2intersect_epi32(__m512i __a, __m512i __b, __mmask16 *__m0, __mmask16 *__m1) {
__builtin_ia32_vp2intersect_d_512((__v16si)__a, (__v16si)__b, __m0, __m1);
}
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between quadwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
///
/// \param __a
/// A 512-bit vector of [8 x i64].
/// \param __b
/// A 512-bit vector of [8 x i64].
/// \param __m0
/// A pointer to an 8-bit mask that receives the match indicators for \a __a.
/// \param __m1
/// A pointer to an 8-bit mask that receives the match indicators for \a __b.
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_2intersect_epi64(__m512i __a, __m512i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_q_512((__v8di)__a, (__v8di)__b, __m0, __m1);
}
#undef __DEFAULT_FN_ATTRS
#endif
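A brief editorial sketch of the 512-bit quadword form (not part of the header); it assumes a CPU and toolchain with AVX512VP2INTERSECT (e.g. -mavx512vp2intersect) and the function name and buffers are hypothetical:

#include <immintrin.h>

// Writes the even/odd mask pair for 8 quadwords from x intersected with 8 from y.
void intersect8_epi64(const long long x[8], const long long y[8],
                      __mmask8 *kx, __mmask8 *ky) {
  __m512i a = _mm512_loadu_si512(x);
  __m512i b = _mm512_loadu_si512(y);
  _mm512_2intersect_epi64(a, b, kx, ky);
}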


@ -0,0 +1,56 @@
/*===----- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics-------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avx512vpopcntdqintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VPOPCNTDQINTRIN_H
#define __AVX512VPOPCNTDQINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vpopcntdq,evex512"), \
__min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
return (__m512i)__builtin_ia32_selectq_512(
(__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) {
return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) {
return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
return (__m512i)__builtin_ia32_selectd_512(
(__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) {
return _mm512_mask_popcnt_epi32((__m512i)_mm512_setzero_si512(), __U, __A);
}
#undef __DEFAULT_FN_ATTRS
#endif
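An editorial sketch of merge-masked population counts with the intrinsics above (not part of the header); it assumes AVX512VPOPCNTDQ support, e.g. a build with -mavx512vpopcntdq, and uses illustrative constants:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512i v = _mm512_set1_epi64(0xFF);  // 8 set bits per lane
  __m512i w = _mm512_set1_epi64(-1);    // value kept in masked-off lanes
  __m512i r = _mm512_mask_popcnt_epi64(w, 0x0F, v);
  long long out[8];
  _mm512_storeu_si512(out, r);
  printf("%lld %lld\n", out[0], out[7]);  // 8 for lanes 0-3, -1 for lanes 4-7
  return 0;
}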


@ -0,0 +1,95 @@
/*===---- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics -------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VPOPCNTDQVLINTRIN_H
#define __AVX512VPOPCNTDQVLINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vpopcntdq,avx512vl,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vpopcntdq,avx512vl,no-evex512"), \
__min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi64(__m128i __A) {
return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
return (__m128i)__builtin_ia32_selectq_128(
(__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi32(__m128i __A) {
return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
return (__m128i)__builtin_ia32_selectd_128(
(__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi64(__m256i __A) {
return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
return (__m256i)__builtin_ia32_selectq_256(
(__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi32(__m256i __A) {
return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
return (__m256i)__builtin_ia32_selectd_256(
(__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) {
return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif
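The same idea with the 128-bit VL form and zero masking, again as an editorial sketch (assumes -mavx512vpopcntdq -mavx512vl; the values are illustrative):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_set_epi32(0, 1, 3, 7);       // popcounts 0, 1, 2, 3
  __m128i r = _mm_maskz_popcnt_epi32(0x5, v);  // keep lanes 0 and 2, zero the rest
  int out[4];
  _mm_storeu_si128((__m128i *)out, r);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 3 0 1 0
  return 0;
}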

third_party/intel/clang/avxifmaintrin.h vendored Normal file (+177 lines)

@ -0,0 +1,177 @@
/*===----------------- avxifmaintrin.h - IFMA intrinsics -------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVXIFMAINTRIN_H
#define __AVXIFMAINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
__min_vector_width__(256)))
// These intrinsics must use VEX encoding.
/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i
/// _mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
///
/// \returns
/// A 128-bit vector of [2 x i64] containing the result (\a dst).
/// \param __X
/// A 128-bit vector of [2 x i64]
/// \param __Y
/// A 128-bit vector of [2 x i64]
/// \param __Z
/// A 128-bit vector of [2 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 1
/// i := j*64
/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
(__v2di)__Z);
}
/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i
/// _mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
///
/// \returns
/// A 256-bit vector of [4 x i64] containing the result (\a dst).
/// \param __X
/// A 256-bit vector of [4 x i64]
/// \param __Y
/// A 256-bit vector of [4 x i64]
/// \param __Z
/// A 256-bit vector of [4 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 3
/// i := j*64
/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
(__v4di)__Z);
}
/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i
/// _mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
///
/// \returns
/// A 128-bit vector of [2 x i64] containing the result (\a dst).
/// \param __X
/// A 128-bit vector of [2 x i64]
/// \param __Y
/// A 128-bit vector of [2 x i64]
/// \param __Z
/// A 128-bit vector of [2 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 1
/// i := j*64
/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
(__v2di)__Z);
}
/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result with the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i
/// _mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
///
/// \returns
/// A 256-bit vector of [4 x i64] containing the result (\a dst).
/// \param __X
/// A 256-bit vector of [4 x i64]
/// \param __Y
/// A 256-bit vector of [4 x i64]
/// \param __Z
/// A 256-bit vector of [4 x i64]
///
/// \code{.operation}
/// FOR j := 0 to 3
/// i := j*64
/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
(__v4di)__Z);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif // __AVXIFMAINTRIN_H
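An editorial sketch of one 52-bit multiply-add step using the VEX-encoded intrinsics above (not part of the header); it assumes AVX-IFMA support, e.g. -mavxifma, and deliberately picks 2^26 * 2^26 = 2^52 so the low and high halves are easy to check:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i acc = _mm_setzero_si128();
  __m128i y = _mm_set1_epi64x(1LL << 26);
  __m128i z = _mm_set1_epi64x(1LL << 26);
  // y*z = 2^52: its low 52 bits are zero, and bit 52 is the lowest bit of the
  // high half, so the "lo" form adds 0 and the "hi" form adds 1.
  __m128i lo = _mm_madd52lo_avx_epu64(acc, y, z);
  __m128i hi = _mm_madd52hi_avx_epu64(acc, y, z);
  long long o[2];
  _mm_storeu_si128((__m128i *)o, lo);
  printf("lo=%lld ", o[0]);  // 0
  _mm_storeu_si128((__m128i *)o, hi);
  printf("hi=%lld\n", o[0]);  // 1
  return 0;
}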

third_party/intel/clang/avxintrin.h vendored Normal file (+5126 lines)
File diff suppressed because it is too large.


@ -0,0 +1,484 @@
/*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H
#ifdef __SSE2__
#ifndef __AVXNECONVERTINTRIN_H
#define __AVXNECONVERTINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
__min_vector_width__(256)))
/// Convert the scalar BF16 (16-bit) floating-point element stored at memory
/// location \a __A to single precision (32-bit), broadcast it to all packed
/// single-precision (32-bit) floating-point elements, and store the results
/// in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_bcstnebf16_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
///
/// \param __A
/// A pointer to a 16-bit memory location. The address of the memory
/// location does not have to be aligned.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 3
/// m := j*32
/// dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnebf16_ps(const void *__A) {
return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
}
/// Convert the scalar BF16 (16-bit) floating-point element stored at memory
/// location \a __A to single precision (32-bit), broadcast it to all packed
/// single-precision (32-bit) floating-point elements, and store the results
/// in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_bcstnebf16_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
///
/// \param __A
/// A pointer to a 16-bit memory location. The address of the memory
/// location does not have to be aligned.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 7
/// m := j*32
/// dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnebf16_ps(const void *__A) {
return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
}
/// Convert the scalar half-precision (16-bit) floating-point element stored at
/// memory location \a __A to single precision (32-bit), broadcast it to all
/// packed single-precision (32-bit) floating-point elements, and store the
/// results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_bcstnesh_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
///
/// \param __A
/// A pointer to a 16-bit memory location. The address of the memory
/// location does not have to be aligned.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 3
/// m := j*32
/// dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnesh_ps(const void *__A) {
return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
}
/// Convert the scalar half-precision (16-bit) floating-point element stored at
/// memory location \a __A to single precision (32-bit), broadcast it to all
/// packed single-precision (32-bit) floating-point elements, and store the
/// results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_bcstnesh_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
///
/// \param __A
/// A pointer to a 16-bit memory location. The address of the memory
/// location does not have to be aligned.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 7
/// m := j*32
/// dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnesh_ps(const void *__A) {
return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
}
/// Convert packed BF16 (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneebf16_ps(const __m128bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
///
/// \param __A
/// A pointer to a 128-bit memory location containing 8 consecutive
/// BF16 (16-bit) floating-point values.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// k := j*2
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneebf16_ps(const __m128bh *__A) {
return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
}
/// Convert packed BF16 (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneebf16_ps(const __m256bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
///
/// \param __A
/// A pointer to a 256-bit memory location containing 16 consecutive
/// BF16 (16-bit) floating-point values.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// k := j*2
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneebf16_ps(const __m256bh *__A) {
return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
}
/// Convert packed half-precision (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneeph_ps(const __m128h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
///
/// \param __A
/// A pointer to a 128-bit memory location containing 8 consecutive
/// half-precision (16-bit) floating-point values.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// k := j*2
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneeph_ps(const __m128h *__A) {
return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
}
/// Convert packed half-precision (16-bit) floating-point even-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneeph_ps(const __m256h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
///
/// \param __A
/// A pointer to a 256-bit memory location containing 16 consecutive
/// half-precision (16-bit) floating-point values.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// k := j*2
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneeph_ps(const __m256h *__A) {
return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
}
/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneobf16_ps(const __m128bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
///
/// \param __A
/// A pointer to a 128-bit memory location containing 8 consecutive
/// BF16 (16-bit) floating-point values.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// k := j*2+1
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneobf16_ps(const __m128bh *__A) {
return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
}
/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneobf16_ps(const __m256bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
///
/// \param __A
/// A pointer to a 256-bit memory location containing 16 consecutive
/// BF16 (16-bit) floating-point values.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// k := j*2+1
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneobf16_ps(const __m256bh *__A) {
return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
}
/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneoph_ps(const __m128h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
///
/// \param __A
/// A pointer to a 128-bit memory location containing 8 consecutive
/// half-precision (16-bit) floating-point values.
/// \returns
/// A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// k := j*2+1
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneoph_ps(const __m128h *__A) {
return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
}
/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
/// stored at memory locations starting at location \a __A to packed
/// single-precision (32-bit) floating-point elements, and store the results in
/// \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneoph_ps(const __m256h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
///
/// \param __A
/// A pointer to a 256-bit memory location containing 16 consecutive
/// half-precision (16-bit) floating-point values.
/// \returns
/// A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// k := j*2+1
/// i := k*16
/// m := j*32
/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneoph_ps(const __m256h *__A) {
return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
}
/// Convert packed single-precision (32-bit) floating-point elements in \a __A
/// to packed BF16 (16-bit) floating-point elements, and store the results in \a
/// dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneps_avx_pbh(__m128 __A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \returns
/// A 128-bit vector of [8 x bfloat].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtneps_avx_pbh(__m128 __A) {
return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
}
/// Convert packed single-precision (32-bit) floating-point elements in \a __A
/// to packed BF16 (16-bit) floating-point elements, and store the results in \a
/// dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneps_avx_pbh(__m256 __A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \returns
/// A 128-bit vector of [8 x bfloat].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_cvtneps_avx_pbh(__m256 __A) {
return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif // __AVXNECONVERTINTRIN_H
#endif // __SSE2__
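An editorial round-trip sketch for the conversion intrinsics above (not part of the header); it assumes AVX-NE-CONVERT support, e.g. -mavxneconvert, and the buffer layout shown is just one way to inspect the packed bf16 result:

#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  __m128 f = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
  __m128bh b = _mm_cvtneps_avx_pbh(f);  // 4 bf16 values in the low 64 bits
  unsigned short words[8];
  memcpy(words, &b, sizeof(b));
  __m128 r = _mm_bcstnebf16_ps(&words[2]);  // broadcast the bf16 for 3.0f
  float out[4];
  _mm_storeu_ps(out, r);
  printf("%f %f\n", out[0], out[3]);  // both approximately 3.0
  return 0;
}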


@ -0,0 +1,473 @@
/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H
#ifndef __AVXVNNIINT16INTRIN_H
#define __AVXVNNIINT16INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
__min_vector_width__(256)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUD instruction.
///
/// \param __W
/// A 128-bit vector of [4 x int].
/// \param __A
/// A 128-bit vector of [8 x short].
/// \param __B
/// A 128-bit vector of [8 x unsigned short].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUD instruction.
///
/// \param __W
/// A 256-bit vector of [8 x int].
/// \param __A
/// A 256-bit vector of [16 x short].
/// \param __B
/// A 256-bit vector of [16 x unsigned short].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
///
/// \param __W
/// A 128-bit vector of [4 x int].
/// \param __A
/// A 128-bit vector of [8 x short].
/// \param __B
/// A 128-bit vector of [8 x unsigned short].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
///
/// \param __W
/// A 256-bit vector of [8 x int].
/// \param __A
/// A 256-bit vector of [16 x short].
/// \param __B
/// A 256-bit vector of [16 x unsigned short].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSD instruction.
///
/// \param __W
/// A 128-bit vector of [4 x int].
/// \param __A
/// A 128-bit vector of [8 x unsigned short].
/// \param __B
/// A 128-bit vector of [8 x short].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSD instruction.
///
/// \param __W
/// A 256-bit vector of [8 x int].
/// \param __A
/// A 256-bit vector of [16 x unsigned short].
/// \param __B
/// A 256-bit vector of [16 x short].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
///
/// \param __W
/// A 128-bit vector of [4 x int].
/// \param __A
/// A 128-bit vector of [8 x unsigned short].
/// \param __B
/// A 128-bit vector of [8 x short].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
///
/// \param __W
/// A 256-bit vector of [8 x int].
/// \param __A
/// A 256-bit vector of [16 x unsigned short].
/// \param __B
/// A 256-bit vector of [16 x short].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUD instruction.
///
/// \param __W
/// A 128-bit vector of [4 x unsigned int].
/// \param __A
/// A 128-bit vector of [8 x unsigned short].
/// \param __B
/// A 128-bit vector of [8 x unsigned short].
/// \returns
/// A 128-bit vector of [4 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUD instruction.
///
/// \param __W
/// A 256-bit vector of [8 x unsigned int].
/// \param __A
/// A 256-bit vector of [16 x unsigned short].
/// \param __B
/// A 256-bit vector of [16 x unsigned short].
/// \returns
/// A 256-bit vector of [8 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
///
/// \param __W
/// A 128-bit vector of [4 x unsigned int].
/// \param __A
/// A 128-bit vector of [8 x unsigned short].
/// \param __B
/// A 128-bit vector of [8 x unsigned short].
/// \returns
/// A 128-bit vector of [4 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
///
/// \param __W
/// A 256-bit vector of [8 x unsigned int].
/// \param __A
/// A 256-bit vector of [16 x unsigned short].
/// \param __B
/// A 256-bit vector of [16 x unsigned short].
/// \returns
/// A 256-bit vector of [8 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif // __AVXVNNIINT16INTRIN_H
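An editorial sketch of one signed-by-unsigned word dot-product step with the intrinsics above (not part of the header); it assumes AVX-VNNI-INT16 support, e.g. -mavxvnniint16, with illustrative constants:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i acc = _mm_set1_epi32(100);
  __m128i a = _mm_set1_epi16(-2);  // signed 16-bit words
  __m128i b = _mm_set1_epi16(3);   // unsigned 16-bit words
  // Each 32-bit lane becomes 100 + (-2*3) + (-2*3) = 88.
  __m128i r = _mm_dpwsud_epi32(acc, a, b);
  int out[4];
  _mm_storeu_si128((__m128i *)out, r);
  printf("%d\n", out[0]);  // 88
  return 0;
}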


@ -0,0 +1,471 @@
/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error \
"Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVXVNNIINT8INTRIN_H
#define __AVXVNNIINT8INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
__min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
__min_vector_width__(128)))
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
/// A 128-bit vector of [16 x char].
/// \param __B
/// A 128-bit vector of [16 x char].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
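/* Editorial usage sketch (not part of the vendored header), assuming
 * AVX-VNNI-INT8 support (e.g. a build with -mavxvnniint8) and illustrative
 * values:
 *
 *   __m128i acc = _mm_setzero_si128();
 *   __m128i a = _mm_set1_epi8(-1);      // signed bytes
 *   __m128i b = _mm_set1_epi8(2);       // signed bytes
 *   acc = _mm_dpbssd_epi32(acc, a, b);  // each dword += 4 * (-1 * 2) = -8
 */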
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
/// A 256-bit vector of [32 x char].
/// \param __B
/// A 256-bit vector of [32 x char].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __A
/// A 128-bit vector of [16 x char].
/// \param __B
/// A 128-bit vector of [16 x char].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __A
/// A 256-bit vector of [32 x char].
/// \param __B
/// A 256-bit vector of [32 x char].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __A
/// A 128-bit vector of [16 x char].
/// \param __B
/// A 128-bit vector of [16 x unsigned char].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __A
/// A 256-bit vector of [32 x char].
/// \param __B
/// A 256-bit vector of [32 x unsigned char].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __A
/// A 128-bit vector of [16 x char].
/// \param __B
/// A 128-bit vector of [16 x unsigned char].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __A
/// A 256-bit vector of [32 x char].
/// \param __B
/// A 256-bit vector of [32 x unsigned char].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __A
/// A 128-bit vector of [16 x unsigned char].
/// \param __B
/// A 128-bit vector of [16 x unsigned char].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __A
/// A 256-bit vector of [32 x unsigned char].
/// \param __B
/// A 256-bit vector of [32 x unsigned char].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __A
/// A 128-bit vector of [16 x unsigned char].
/// \param __B
/// A 128-bit vector of [16 x unsigned char].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
__m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
(__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __A
/// A 256-bit vector of [32 x unsigned char].
/// \param __B
/// A 256-bit vector of [32 x unsigned char].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
(__v8si)__B);
}
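/* Illustrative usage sketch (not part of the vendored header): accumulating an
 * unsigned-byte dot product with the AVX-VNNI-INT8 intrinsic documented above.
 * The buffer names and the -mavxvnniint8 build flag are assumptions for the
 * example only. */
#include <x86intrin.h>

/* Sums a[i]*b[i] over 32*n unsigned bytes, 32 bytes per iteration. */
static unsigned dot_u8(const unsigned char *a, const unsigned char *b, int n) {
  __m256i acc = _mm256_setzero_si256();
  for (int i = 0; i < n; ++i) {
    __m256i va = _mm256_loadu_si256((const __m256i *)(a + 32 * i));
    __m256i vb = _mm256_loadu_si256((const __m256i *)(b + 32 * i));
    acc = _mm256_dpbuud_epi32(acc, va, vb); /* 8 running 32-bit partial sums */
  }
  unsigned lanes[8];
  _mm256_storeu_si256((__m256i *)lanes, acc);
  return lanes[0] + lanes[1] + lanes[2] + lanes[3] +
         lanes[4] + lanes[5] + lanes[6] + lanes[7];
}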
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif // __AVXVNNIINT8INTRIN_H

225
third_party/intel/clang/avxvnniintrin.h vendored Normal file
View file

@ -0,0 +1,225 @@
/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVXVNNIINTRIN_H
#define __AVXVNNIINTRIN_H
/* The intrinsics below, defined in avx512vlvnniintrin.h, can also be used for AVXVNNI */
/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
/* Intrinsics with _avx_ prefix are for compatibility with msvc. */
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
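/* Illustrative sketch (not part of the vendored header): one accumulation step
 * of a u8 x s8 dot product using the AVX-VNNI form above, built with -mavxvnni.
 * The function and variable names are hypothetical. */
#include <immintrin.h>

static __m256i dpbusd_step(__m256i acc, __m256i u8vals, __m256i s8vals) {
  /* acc.dword[j] += sum of 4 adjacent u8*s8 products, per the operation above */
  return _mm256_dpbusd_avx_epi32(acc, u8vals, s8vals);
}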
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif // __AVXVNNIINTRIN_H

255
third_party/intel/clang/bmi2intrin.h vendored Normal file
View file

@ -0,0 +1,255 @@
/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <bmi2intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __BMI2INTRIN_H
#define __BMI2INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
/// Copies the unsigned 32-bit integer \a __X and zeroes the upper bits
/// starting at bit number \a __Y.
///
/// \code{.operation}
/// i := __Y[7:0]
/// result := __X
/// IF i < 32
/// result[31:i] := 0
/// FI
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c BZHI instruction.
///
/// \param __X
/// The 32-bit source value to copy.
/// \param __Y
/// The lower 8 bits specify the bit number of the lowest bit to zero.
/// \returns The partially zeroed 32-bit value.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bzhi_u32(unsigned int __X, unsigned int __Y)
{
return __builtin_ia32_bzhi_si(__X, __Y);
}
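/* Illustrative sketch (not part of the vendored header): _bzhi_u32 keeps only
 * the low __Y bits of __X. The helper name and the constant are hypothetical. */
#include <immintrin.h>

static unsigned low12_bits(unsigned x) {
  return _bzhi_u32(x, 12); /* e.g. 0xABCDE -> 0xCDE */
}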
/// Deposit (scatter) low-order bits from the unsigned 32-bit integer \a __X
/// into the 32-bit result, according to the mask in the unsigned 32-bit
/// integer \a __Y. All other bits of the result are zero.
///
/// \code{.operation}
/// i := 0
/// result := 0
/// FOR m := 0 TO 31
/// IF __Y[m] == 1
/// result[m] := __X[i]
/// i := i + 1
/// ENDIF
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c PDEP instruction.
///
/// \param __X
/// The 32-bit source value to copy.
/// \param __Y
/// The 32-bit mask specifying where to deposit source bits.
/// \returns The 32-bit result.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_pdep_u32(unsigned int __X, unsigned int __Y)
{
return __builtin_ia32_pdep_si(__X, __Y);
}
/// Extract (gather) bits from the unsigned 32-bit integer \a __X into the
/// low-order bits of the 32-bit result, according to the mask in the
/// unsigned 32-bit integer \a __Y. All other bits of the result are zero.
///
/// \code{.operation}
/// i := 0
/// result := 0
/// FOR m := 0 TO 31
/// IF __Y[m] == 1
/// result[i] := __X[m]
/// i := i + 1
/// ENDIF
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c PEXT instruction.
///
/// \param __X
/// The 32-bit source value to copy.
/// \param __Y
/// The 32-bit mask specifying which source bits to extract.
/// \returns The 32-bit result.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_pext_u32(unsigned int __X, unsigned int __Y)
{
return __builtin_ia32_pext_si(__X, __Y);
}
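/* Illustrative sketch (not part of the vendored header): _pext_u32 gathers the
 * bits selected by a mask into the low bits, and _pdep_u32 scatters them back,
 * so the round trip reproduces x & mask. Names and the mask are hypothetical. */
#include <immintrin.h>

static unsigned gather_then_scatter(unsigned x) {
  unsigned mask = 0x0F0Fu;              /* select two nibbles */
  unsigned packed = _pext_u32(x, mask); /* 8 selected bits, right-justified */
  return _pdep_u32(packed, mask);       /* equals x & mask */
}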
/// Multiplies the unsigned 32-bit integers \a __X and \a __Y to form a
/// 64-bit product. Stores the upper 32 bits of the product in the
/// memory at \a __P and returns the lower 32 bits.
///
/// \code{.operation}
/// Store32(__P, (__X * __Y)[63:32])
/// result := (__X * __Y)[31:0]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c MULX instruction.
///
/// \param __X
/// An unsigned 32-bit multiplicand.
/// \param __Y
/// An unsigned 32-bit multiplicand.
/// \param __P
/// A pointer to memory for storing the upper half of the product.
/// \returns The lower half of the product.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P)
{
unsigned long long __res = (unsigned long long) __X * __Y;
*__P = (unsigned int)(__res >> 32);
return (unsigned int)__res;
}
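/* Illustrative sketch (not part of the vendored header): _mulx_u32 produces a
 * full 32x32 -> 64-bit product, returning the low half and storing the high
 * half, without clobbering flags. The function name is hypothetical. */
#include <immintrin.h>

static void widening_mul(unsigned a, unsigned b, unsigned *hi, unsigned *lo) {
  *lo = _mulx_u32(a, b, hi); /* *hi = bits 63:32, return value = bits 31:0 */
}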
#ifdef __x86_64__
/// Copies the unsigned 64-bit integer \a __X and zeroes the upper bits
/// starting at bit number \a __Y.
///
/// \code{.operation}
/// i := __Y[7:0]
/// result := __X
/// IF i < 64
/// result[63:i] := 0
/// FI
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c BZHI instruction.
///
/// \param __X
/// The 64-bit source value to copy.
/// \param __Y
/// The lower 8 bits specify the bit number of the lowest bit to zero.
/// \returns The partially zeroed 64-bit value.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_bzhi_u64(unsigned long long __X, unsigned long long __Y)
{
return __builtin_ia32_bzhi_di(__X, __Y);
}
/// Deposit (scatter) low-order bits from the unsigned 64-bit integer \a __X
/// into the 64-bit result, according to the mask in the unsigned 64-bit
/// integer \a __Y. All other bits of the result are zero.
///
/// \code{.operation}
/// i := 0
/// result := 0
/// FOR m := 0 TO 63
/// IF __Y[m] == 1
/// result[m] := __X[i]
/// i := i + 1
/// ENDIF
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c PDEP instruction.
///
/// \param __X
/// The 64-bit source value to copy.
/// \param __Y
/// The 64-bit mask specifying where to deposit source bits.
/// \returns The 64-bit result.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_pdep_u64(unsigned long long __X, unsigned long long __Y)
{
return __builtin_ia32_pdep_di(__X, __Y);
}
/// Extract (gather) bits from the unsigned 64-bit integer \a __X into the
/// low-order bits of the 64-bit result, according to the mask in the
/// unsigned 64-bit integer \a __Y. All other bits of the result are zero.
///
/// \code{.operation}
/// i := 0
/// result := 0
/// FOR m := 0 TO 63
/// IF __Y[m] == 1
/// result[i] := __X[m]
/// i := i + 1
/// ENDIF
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c PEXT instruction.
///
/// \param __X
/// The 64-bit source value to copy.
/// \param __Y
/// The 64-bit mask specifying which source bits to extract.
/// \returns The 64-bit result.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_pext_u64(unsigned long long __X, unsigned long long __Y)
{
return __builtin_ia32_pext_di(__X, __Y);
}
/// Multiplies the unsigned 64-bit integers \a __X and \a __Y to form a
/// 128-bit product. Stores the upper 64 bits of the product to the
/// memory addressed by \a __P and returns the lower 64 bits.
///
/// \code{.operation}
/// Store64(__P, (__X * __Y)[127:64])
/// result := (__X * __Y)[63:0]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c MULX instruction.
///
/// \param __X
/// An unsigned 64-bit multiplicand.
/// \param __Y
/// An unsigned 64-bit multiplicand.
/// \param __P
/// A pointer to memory for storing the upper half of the product.
/// \returns The lower half of the product.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
unsigned long long *__P)
{
unsigned __int128 __res = (unsigned __int128) __X * __Y;
*__P = (unsigned long long) (__res >> 64);
return (unsigned long long) __res;
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#endif /* __BMI2INTRIN_H */

614
third_party/intel/clang/bmiintrin.h vendored Normal file
View file

@ -0,0 +1,614 @@
/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __BMIINTRIN_H
#define __BMIINTRIN_H
/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
instruction behaves as BSF on non-BMI targets, there is code that expects
to use it as a potentially faster version of BSF. */
#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
/// An unsigned 16-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 16-bit integer containing the number of trailing zero
/// bits in the operand.
/// \see _tzcnt_u16
static __inline__ unsigned short __RELAXED_FN_ATTRS
__tzcnt_u16(unsigned short __X)
{
return __builtin_ia32_tzcnt_u16(__X);
}
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned short _tzcnt_u16(unsigned short __X);
/// \endcode
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
/// An unsigned 16-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 16-bit integer containing the number of trailing zero
/// bits in the operand.
/// \see __tzcnt_u16
#define _tzcnt_u16 __tzcnt_u16
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of trailing zero
/// bits in the operand.
/// \see { _mm_tzcnt_32 _tzcnt_u32 }
static __inline__ unsigned int __RELAXED_FN_ATTRS
__tzcnt_u32(unsigned int __X)
{
return __builtin_ia32_tzcnt_u32(__X);
}
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns A 32-bit integer containing the number of trailing zero bits in
/// the operand.
/// \see { __tzcnt_u32 _tzcnt_u32 }
static __inline__ int __RELAXED_FN_ATTRS
_mm_tzcnt_32(unsigned int __X)
{
return (int)__builtin_ia32_tzcnt_u32(__X);
}
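/* Illustrative sketch (not part of the vendored header): _mm_tzcnt_32 counts
 * trailing zero bits and, unlike BSF, is well defined for a zero input. */
#include <x86intrin.h>

static int lowest_set_bit_index(unsigned x) {
  return _mm_tzcnt_32(x); /* e.g. 0x28 -> 3, 0 -> 32 */
}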
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _tzcnt_u32(unsigned int __X);
/// \endcode
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of trailing zero
/// bits in the operand.
/// \see { _mm_tzcnt_32 __tzcnt_u32 }
#define _tzcnt_u32 __tzcnt_u32
#ifdef __x86_64__
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of trailing zero
/// bits in the operand.
/// \see { _mm_tzcnt_64 _tzcnt_u64 }
static __inline__ unsigned long long __RELAXED_FN_ATTRS
__tzcnt_u64(unsigned long long __X)
{
return __builtin_ia32_tzcnt_u64(__X);
}
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns A 64-bit integer containing the number of trailing zero bits in
/// the operand.
/// \see { __tzcnt_u64 _tzcnt_u64 }
static __inline__ long long __RELAXED_FN_ATTRS
_mm_tzcnt_64(unsigned long long __X)
{
return (long long)__builtin_ia32_tzcnt_u64(__X);
}
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _tzcnt_u64(unsigned long long __X);
/// \endcode
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of trailing zero
/// bits in the operand.
/// \see { _mm_tzcnt_64 __tzcnt_u64 }
#define _tzcnt_u64 __tzcnt_u64
#endif /* __x86_64__ */
#undef __RELAXED_FN_ATTRS
#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI__)
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
/// Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ANDN instruction.
///
/// \param __X
/// An unsigned integer containing one of the operands.
/// \param __Y
/// An unsigned integer containing one of the operands.
/// \returns An unsigned integer containing the bitwise AND of the second
/// operand with the one's complement of the first operand.
/// \see _andn_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__andn_u32(unsigned int __X, unsigned int __Y)
{
return ~__X & __Y;
}
/// Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _andn_u32(unsigned int __X, unsigned int __Y);
/// \endcode
///
/// This intrinsic corresponds to the \c ANDN instruction.
///
/// \param __X
/// An unsigned integer containing one of the operands.
/// \param __Y
/// An unsigned integer containing one of the operands.
/// \returns An unsigned integer containing the bitwise AND of the second
/// operand with the one's complement of the first operand.
/// \see __andn_u32
#define _andn_u32 __andn_u32
/* AMD-specified, double-leading-underscore version of BEXTR */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BEXTR instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be extracted.
/// \param __Y
/// An unsigned integer used to specify which bits are extracted. Bits [7:0]
/// specify the index of the least significant bit. Bits [15:8] specify the
/// number of bits to be extracted.
/// \returns An unsigned integer whose least significant bits contain the
/// extracted bits.
/// \see _bextr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__bextr_u32(unsigned int __X, unsigned int __Y)
{
return __builtin_ia32_bextr_u32(__X, __Y);
}
/* Intel-specified, single-leading-underscore version of BEXTR */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BEXTR instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be extracted.
/// \param __Y
/// An unsigned integer used to specify the index of the least significant
/// bit for the bits to be extracted. Bits [7:0] specify the index.
/// \param __Z
/// An unsigned integer used to specify the number of bits to be extracted.
/// Bits [7:0] specify the number of bits.
/// \returns An unsigned integer whose least significant bits contain the
/// extracted bits.
/// \see __bextr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
{
return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}
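/* Illustrative sketch (not part of the vendored header): extracting an 8-bit
 * field that starts at bit 4, using the Intel-style three-operand form. The
 * field position and width are hypothetical. */
#include <x86intrin.h>

static unsigned extract_field(unsigned x) {
  return _bextr_u32(x, 4, 8); /* bits [11:4], e.g. 0x12345 -> 0x34 */
}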
/* Intel-specified, single-leading-underscore version of BEXTR2 */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BEXTR instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be extracted.
/// \param __Y
/// An unsigned integer used to specify which bits are extracted. Bits [7:0]
/// specify the index of the least significant bit. Bits [15:8] specify the
/// number of bits to be extracted.
/// \returns An unsigned integer whose least significant bits contain the
/// extracted bits.
/// \see __bextr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bextr2_u32(unsigned int __X, unsigned int __Y) {
return __builtin_ia32_bextr_u32(__X, __Y);
}
/// Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BLSI instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be cleared.
/// \returns An unsigned integer containing the result of clearing the bits from
/// the source operand.
/// \see _blsi_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsi_u32(unsigned int __X)
{
return __X & -__X;
}
/// Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _blsi_u32(unsigned int __X);
/// \endcode
///
/// This intrinsic corresponds to the \c BLSI instruction.
///
/// \param __X
/// An unsigned integer whose bits are to be cleared.
/// \returns An unsigned integer containing the result of clearing the bits from
/// the source operand.
/// \see __blsi_u32
#define _blsi_u32 __blsi_u32
/// Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BLSMSK instruction.
///
/// \param __X
/// An unsigned integer used to create the mask.
/// \returns An unsigned integer containing the newly created mask.
/// \see _blsmsk_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsmsk_u32(unsigned int __X)
{
return __X ^ (__X - 1);
}
/// Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _blsmsk_u32(unsigned int __X);
/// \endcode
///
/// This intrinsic corresponds to the \c BLSMSK instruction.
///
/// \param __X
/// An unsigned integer used to create the mask.
/// \returns An unsigned integer containing the newly created mask.
/// \see __blsmsk_u32
#define _blsmsk_u32 __blsmsk_u32
/// Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BLSR instruction.
///
/// \param __X
/// An unsigned integer containing the operand to be cleared.
/// \returns An unsigned integer containing the result of clearing the source
/// operand.
/// \see _blsr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsr_u32(unsigned int __X)
{
return __X & (__X - 1);
}
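/* Illustrative sketch (not part of the vendored header): visiting every set bit
 * by repeatedly clearing the lowest one with __blsr_u32; here it just counts
 * them. */
#include <x86intrin.h>

static int count_set_bits(unsigned x) {
  int n = 0;
  while (x) {
    x = __blsr_u32(x); /* drop the lowest set bit */
    ++n;
  }
  return n;
}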
/// Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _blsr_u32(unsigned int __X);
/// \endcode
///
/// This intrinsic corresponds to the \c BLSR instruction.
///
/// \param __X
/// An unsigned integer containing the operand to be cleared.
/// \returns An unsigned integer containing the result of clearing the source
/// operand.
/// \see __blsr_u32
#define _blsr_u32 __blsr_u32
#ifdef __x86_64__
/// Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ANDN instruction.
///
/// \param __X
/// An unsigned 64-bit integer containing one of the operands.
/// \param __Y
/// An unsigned 64-bit integer containing one of the operands.
/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
/// operand with the one's complement of the first operand.
/// \see _andn_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__andn_u64 (unsigned long long __X, unsigned long long __Y)
{
return ~__X & __Y;
}
/// Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _andn_u64(unsigned long long __X,
/// unsigned long long __Y);
/// \endcode
///
/// This intrinsic corresponds to the \c ANDN instruction.
///
/// \param __X
/// An unsigned 64-bit integer containing one of the operands.
/// \param __Y
/// An unsigned 64-bit integer containing one of the operands.
/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
/// operand with the one's complement of the first operand.
/// \see __andn_u64
#define _andn_u64 __andn_u64
/* AMD-specified, double-leading-underscore version of BEXTR */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BEXTR instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be extracted.
/// \param __Y
/// An unsigned 64-bit integer used to specify which bits are extracted. Bits
/// [7:0] specify the index of the least significant bit. Bits [15:8] specify
/// the number of bits to be extracted.
/// \returns An unsigned 64-bit integer whose least significant bits contain the
/// extracted bits.
/// \see _bextr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__bextr_u64(unsigned long long __X, unsigned long long __Y)
{
return __builtin_ia32_bextr_u64(__X, __Y);
}
/* Intel-specified, single-leading-underscore version of BEXTR */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BEXTR instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be extracted.
/// \param __Y
/// An unsigned integer used to specify the index of the least significant
/// bit for the bits to be extracted. Bits [7:0] specify the index.
/// \param __Z
/// An unsigned integer used to specify the number of bits to be extracted.
/// Bits [7:0] specify the number of bits.
/// \returns An unsigned 64-bit integer whose least significant bits contain the
/// extracted bits.
/// \see __bextr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
{
return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}
/* Intel-specified, single-leading-underscore version of BEXTR2 */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BEXTR instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be extracted.
/// \param __Y
/// An unsigned 64-bit integer used to specify which bits are extracted. Bits
/// [7:0] specify the index of the least significant bit. Bits [15:8] specify
/// the number of bits to be extracted.
/// \returns An unsigned 64-bit integer whose least significant bits contain the
/// extracted bits.
/// \see __bextr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_bextr2_u64(unsigned long long __X, unsigned long long __Y) {
return __builtin_ia32_bextr_u64(__X, __Y);
}
/// Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BLSI instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be cleared.
/// \returns An unsigned 64-bit integer containing the result of clearing the
/// bits from the source operand.
/// \see _blsi_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsi_u64(unsigned long long __X)
{
return __X & -__X;
}
/// Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _blsi_u64(unsigned long long __X);
/// \endcode
///
/// This intrinsic corresponds to the \c BLSI instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose bits are to be cleared.
/// \returns An unsigned 64-bit integer containing the result of clearing the
/// bits from the source operand.
/// \see __blsi_u64
#define _blsi_u64 __blsi_u64
/// Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BLSMSK instruction.
///
/// \param __X
/// An unsigned 64-bit integer used to create the mask.
/// \returns An unsigned 64-bit integer containing the newly created mask.
/// \see _blsmsk_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsmsk_u64(unsigned long long __X)
{
return __X ^ (__X - 1);
}
/// Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _blsmsk_u64(unsigned long long __X);
/// \endcode
///
/// This intrinsic corresponds to the \c BLSMSK instruction.
///
/// \param __X
/// An unsigned 64-bit integer used to create the mask.
/// \returns An unsigned 64-bit integer containing the newly created mask.
/// \see __blsmsk_u64
#define _blsmsk_u64 __blsmsk_u64
/// Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BLSR instruction.
///
/// \param __X
/// An unsigned 64-bit integer containing the operand to be cleared.
/// \returns An unsigned 64-bit integer containing the result of clearing the
/// source operand.
/// \see _blsr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsr_u64(unsigned long long __X)
{
return __X & (__X - 1);
}
/// Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _blsr_u64(unsigned long long __X);
/// \endcode
///
/// This intrinsic corresponds to the \c BLSR instruction.
///
/// \param __X
/// An unsigned 64-bit integer containing the operand to be cleared.
/// \returns An unsigned 64-bit integer containing the result of clearing the
/// source operand.
/// \see __blsr_u64
#define _blsr_u64 __blsr_u64
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#endif /* !defined(__SCE__) || __has_feature(modules) || defined(__BMI__) */
#endif /* __BMIINTRIN_H */

115
third_party/intel/clang/cetintrin.h vendored Normal file
View file

@ -0,0 +1,115 @@
/*===---- cetintrin.h - CET intrinsic --------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <cetintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __CETINTRIN_H
#define __CETINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("shstk")))
static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) {
__builtin_ia32_incsspd((unsigned int)__a);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) {
__builtin_ia32_incsspq(__a);
}
#endif /* __x86_64__ */
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
__builtin_ia32_incsspq(__a);
}
#else /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
__builtin_ia32_incsspd(__a);
}
#endif /* __x86_64__ */
static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
return __builtin_ia32_rdsspd(__a);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32(void) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wuninitialized"
unsigned int t;
return __builtin_ia32_rdsspd(t);
#pragma clang diagnostic pop
}
#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) {
return __builtin_ia32_rdsspq(__a);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64(void) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wuninitialized"
unsigned long long t;
return __builtin_ia32_rdsspq(t);
#pragma clang diagnostic pop
}
#endif /* __x86_64__ */
#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _get_ssp(void) {
return __builtin_ia32_rdsspq(0);
}
#else /* __x86_64__ */
static __inline__ unsigned int __DEFAULT_FN_ATTRS _get_ssp(void) {
return __builtin_ia32_rdsspd(0);
}
#endif /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp(void) {
__builtin_ia32_saveprevssp();
}
static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) {
__builtin_ia32_rstorssp(__p);
}
static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) {
__builtin_ia32_wrssd(__a, __p);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) {
__builtin_ia32_wrssq(__a, __p);
}
#endif /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) {
__builtin_ia32_wrussd(__a, __p);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) {
__builtin_ia32_wrussq(__a, __p);
}
#endif /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _setssbsy(void) {
__builtin_ia32_setssbsy();
}
static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) {
__builtin_ia32_clrssbsy(__p);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __CETINTRIN_H */

36
third_party/intel/clang/cldemoteintrin.h vendored Normal file
View file

@ -0,0 +1,36 @@
/*===---- cldemoteintrin.h - CLDEMOTE intrinsic ----------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <cldemoteintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __CLDEMOTEINTRIN_H
#define __CLDEMOTEINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("cldemote")))
/// Hint to hardware that the cache line that contains \p __P should be demoted
/// from the cache closest to the processor core to a level more distant from
/// the processor core.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLDEMOTE </c> instruction.
static __inline__ void __DEFAULT_FN_ATTRS
_cldemote(const void * __P) {
__builtin_ia32_cldemote(__P);
}
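/* Illustrative sketch (not part of the vendored header): demoting a freshly
 * written line that another core is expected to read soon. The producer/consumer
 * scenario and names are hypothetical. */
#include <x86intrin.h>

static void publish(int *slot, int value) {
  *slot = value;
  _cldemote(slot); /* hint: move the line toward a cache level shared with readers */
}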
#define _mm_cldemote(p) _cldemote(p)
#undef __DEFAULT_FN_ATTRS
#endif

36
third_party/intel/clang/clflushoptintrin.h vendored Normal file
View file

@ -0,0 +1,36 @@
/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __CLFLUSHOPTINTRIN_H
#define __CLFLUSHOPTINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clflushopt")))
/// Invalidates all levels of the cache hierarchy and flushes modified data to
/// memory for the cache line specified by the address \a __m.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c CLFLUSHOPT instruction.
///
/// \param __m
/// An address within the cache line to flush and invalidate.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_clflushopt(void const * __m) {
__builtin_ia32_clflushopt(__m);
}
#undef __DEFAULT_FN_ATTRS
#endif

38
third_party/intel/clang/clwbintrin.h vendored Normal file
View file

@ -0,0 +1,38 @@
/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <clwbintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __CLWBINTRIN_H
#define __CLWBINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clwb")))
/// Writes back to memory the cache line (if modified) that contains the
/// linear address specified in \a __p from any level of the cache hierarchy in
/// the cache coherence domain.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> CLWB </c> instruction.
///
/// \param __p
/// A pointer to the memory location used to identify the cache line to be
/// written back.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_clwb(void const *__p) {
__builtin_ia32_clwb(__p);
}
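/* Illustrative sketch (not part of the vendored header): one common pattern for
 * making a store durable on persistent memory, assuming p points into a
 * persistence domain. */
#include <immintrin.h>

static void persist_store(long *p, long value) {
  *p = value;
  _mm_clwb(p);  /* write the modified line back without evicting it */
  _mm_sfence(); /* order the write-back before subsequent stores */
}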
#undef __DEFAULT_FN_ATTRS
#endif

38
third_party/intel/clang/clzerointrin.h vendored Normal file
View file

@ -0,0 +1,38 @@
/*===----------------------- clzerointrin.h - CLZERO ----------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86INTRIN_H
#error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __CLZEROINTRIN_H
#define __CLZEROINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("clzero")))
/// Zeroes out the cache line for the address \a __line. This uses a
/// non-temporal store. Calling \c _mm_sfence() afterward might be needed
/// to enforce ordering.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c CLZERO instruction.
///
/// \param __line
/// An address within the cache line to zero out.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_clzero (void * __line)
{
__builtin_ia32_clzero ((void *)__line);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __CLZEROINTRIN_H */

70
third_party/intel/clang/cmpccxaddintrin.h vendored Normal file
View file

@ -0,0 +1,70 @@
/*===--------------- cmpccxaddintrin.h - CMPCCXADD intrinsics--------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86GPRINTRIN_H
#error \
"Never use <cmpccxaddintrin.h> directly; include <x86gprintrin.h> instead."
#endif // __X86GPRINTRIN_H
#ifndef __CMPCCXADDINTRIN_H
#define __CMPCCXADDINTRIN_H
#ifdef __x86_64__
typedef enum {
_CMPCCX_O, /* Overflow. */
_CMPCCX_NO, /* No overflow. */
_CMPCCX_B, /* Below. */
_CMPCCX_NB, /* Not below. */
_CMPCCX_Z, /* Zero. */
_CMPCCX_NZ, /* Not zero. */
_CMPCCX_BE, /* Below or equal. */
_CMPCCX_NBE, /* Neither below nor equal. */
_CMPCCX_S, /* Sign. */
_CMPCCX_NS, /* No sign. */
_CMPCCX_P, /* Parity. */
_CMPCCX_NP, /* No parity. */
_CMPCCX_L, /* Less. */
_CMPCCX_NL, /* Not less. */
_CMPCCX_LE, /* Less or equal. */
_CMPCCX_NLE, /* Neither less nor equal. */
} _CMPCCX_ENUM;
/// Compares the value at the memory location __A with the value of __B. If the
/// specified condition __D is met, then the third operand __C is added to the
/// value at __A and the sum is written back to __A; otherwise the value at __A
/// is left unchanged. The return value is the original value loaded from __A.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c CMPCCXADD instructions.
///
/// \param __A
/// A pointer specifying the memory address.
///
/// \param __B
/// An integer operand.
///
/// \param __C
/// An integer operand.
///
/// \param __D
/// The specified condition.
///
/// \returns An integer containing the original value of the first operand.
#define _cmpccxadd_epi32(__A, __B, __C, __D) \
((int)(__builtin_ia32_cmpccxadd32((void *)(__A), (int)(__B), (int)(__C), \
(int)(__D))))
#define _cmpccxadd_epi64(__A, __B, __C, __D) \
((long long)(__builtin_ia32_cmpccxadd64((void *)(__A), (long long)(__B), \
(long long)(__C), (int)(__D))))
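/* Illustrative sketch (not part of the vendored header): a bounded counter that
 * only increments while the current value is still below a limit, using the
 * "less than" condition. The semantics assumed here follow the description
 * above; the function name is hypothetical. */
#include <x86gprintrin.h>

static int bounded_increment(int *counter, int limit) {
  /* If *counter < limit, add 1 to it; in either case the old value is returned. */
  return _cmpccxadd_epi32(counter, limit, 1, _CMPCCX_L);
}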
#endif // __x86_64__
#endif // __CMPCCXADDINTRIN_H

100
third_party/intel/clang/crc32intrin.h vendored Normal file
View file

@ -0,0 +1,100 @@
/*===---- crc32intrin.h - SSE4.2 Accumulate CRC32 intrinsics ---------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CRC32INTRIN_H
#define __CRC32INTRIN_H
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("crc32")))
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned char operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u8(unsigned int __C, unsigned char __D)
{
return __builtin_ia32_crc32qi(__C, __D);
}
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned short operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u16(unsigned int __C, unsigned short __D)
{
return __builtin_ia32_crc32hi(__C, __D);
}
/// Adds the first unsigned integer operand to the CRC-32C checksum of
/// the second unsigned integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u32(unsigned int __C, unsigned int __D)
{
return __builtin_ia32_crc32si(__C, __D);
}
#ifdef __x86_64__
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned 64-bit integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
{
return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#endif /* __CRC32INTRIN_H */
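A small sketch that folds a byte buffer into a CRC-32C value with the intrinsic
above (build with -mcrc32 or -msse4.2). The 0xFFFFFFFF seed and final inversion
follow the common CRC-32C convention and are an assumption of this example, not
something the header mandates:

#include <x86intrin.h>
#include <stddef.h>

unsigned int crc32c(const unsigned char *p, size_t n) {
  unsigned int crc = 0xFFFFFFFFu;
  for (size_t i = 0; i < n; i++)
    crc = _mm_crc32_u8(crc, p[i]);   /* fold one byte into the running checksum */
  return ~crc;
}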

4906
third_party/intel/clang/emmintrin.h vendored Normal file

File diff suppressed because it is too large.

63
third_party/intel/clang/enqcmdintrin.h vendored Normal file
View file

@ -0,0 +1,63 @@
/*===------------------ enqcmdintrin.h - enqcmd intrinsics -----------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <enqcmdintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __ENQCMDINTRIN_H
#define __ENQCMDINTRIN_H
/* Define the default attributes for the functions in this file */
#define _DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("enqcmd")))
/// Reads the 64-byte command pointed to by \a __src, formats 64-byte enqueue
/// store data, and performs a 64-byte enqueue store to the memory pointed to
/// by \a __dst. This intrinsic may only be used in User mode.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ENQCMD </c> instruction.
///
/// \param __dst
/// Pointer to the destination of the enqueue store.
/// \param __src
/// Pointer to 64-byte command data.
/// \returns If the command data is successfully written to \a __dst then 0 is
/// returned. Otherwise 1 is returned.
static __inline__ int _DEFAULT_FN_ATTRS
_enqcmd (void *__dst, const void *__src)
{
return __builtin_ia32_enqcmd(__dst, __src);
}
/// Reads the 64-byte command pointed to by \a __src, formats 64-byte enqueue
/// store data, and performs a 64-byte enqueue store to the memory pointed to
/// by \a __dst. This intrinsic may only be used in Privileged mode.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ENQCMDS </c> instruction.
///
/// \param __dst
/// Pointer to the destination of the enqueue store.
/// \param __src
/// Pointer to 64-byte command data.
/// \returns If the command data is successfully written to \a __dst then 0 is
/// returned. Otherwise 1 is returned.
static __inline__ int _DEFAULT_FN_ATTRS
_enqcmds (void *__dst, const void *__src)
{
return __builtin_ia32_enqcmds(__dst, __src);
}
#undef _DEFAULT_FN_ATTRS
#endif /* __ENQCMDINTRIN_H */
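A sketch of user-mode work submission with _enqcmd() (requires -menqcmd). The
device portal mapping and the 64-byte descriptor are assumed to come from
elsewhere, e.g. a driver; the names are illustrative:

#include <immintrin.h>

/* Returns 0 if the device accepted the descriptor, 1 if it should be retried. */
int submit_descriptor(void *portal, const void *desc /* 64 bytes */) {
  return _enqcmd(portal, desc);
}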

162
third_party/intel/clang/f16cintrin.h vendored Normal file
View file

@ -0,0 +1,162 @@
/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __IMMINTRIN_H
#error "Never use <f16cintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __F16CINTRIN_H
#define __F16CINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(256)))
/* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h,
* but that's because icc can emulate these without f16c using a library call.
* Since we don't do that let's leave these in f16cintrin.h.
*/
/// Converts a 16-bit half-precision float value into a 32-bit float
/// value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 16-bit half-precision float value.
/// \returns The converted 32-bit float value.
static __inline float __DEFAULT_FN_ATTRS128
_cvtsh_ss(unsigned short __a)
{
__v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
__v4sf __r = __builtin_ia32_vcvtph2ps(__v);
return __r[0];
}
/// Converts a 32-bit single-precision float value to a 16-bit
/// half-precision float value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned short _cvtss_sh(float a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 32-bit single-precision float value to be converted to a 16-bit
/// half-precision float value.
/// \param imm
/// An immediate value controlling rounding using bits [2:0]: \n
/// 000: Nearest \n
/// 001: Down \n
/// 010: Up \n
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
#define _cvtss_sh(a, imm) __extension__ ({ \
(unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
(imm)))[0]); })
/// Converts a 128-bit vector containing 32-bit float values into a
/// 128-bit vector containing 16-bit half-precision float values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 128-bit vector containing 32-bit float values.
/// \param imm
/// An immediate value controlling rounding using bits [2:0]: \n
/// 000: Nearest \n
/// 001: Down \n
/// 010: Up \n
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing converted 16-bit half-precision float
/// values. The lower 64 bits are used to store the converted 16-bit
/// half-precision floating-point values.
#define _mm_cvtps_ph(a, imm) \
((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
/// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 128-bit vector containing 32-bit float values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector containing 16-bit half-precision float values. The lower
/// 64 bits are used in the conversion.
/// \returns A 128-bit vector of [4 x float] containing converted float values.
static __inline __m128 __DEFAULT_FN_ATTRS128
_mm_cvtph_ps(__m128i __a)
{
return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
}
/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
/// containing 16-bit half-precision float values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 256-bit vector containing 32-bit single-precision float values to be
/// converted to 16-bit half-precision float values.
/// \param imm
/// An immediate value controlling rounding using bits [2:0]: \n
/// 000: Nearest \n
/// 001: Down \n
/// 010: Up \n
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing the converted 16-bit half-precision
/// float values.
#define _mm256_cvtps_ph(a, imm) \
((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)))
/// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 256-bit vector of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector containing 16-bit half-precision float values to be
/// converted to 32-bit single-precision float values.
/// \returns A vector of [8 x float] containing the converted 32-bit
/// single-precision float values.
static __inline __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtph_ps(__m128i __a)
{
return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif /* __F16CINTRIN_H */
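A round-trip sketch through half precision using the scalar converters above
(requires -mf16c); rounding mode 0 (round to nearest) is chosen only for
illustration:

#include <immintrin.h>

float roundtrip_through_half(float x) {
  unsigned short h = _cvtss_sh(x, 0);  /* float -> 16-bit half, round to nearest */
  return _cvtsh_ss(h);                 /* 16-bit half -> float */
}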

218
third_party/intel/clang/fma4intrin.h vendored Normal file
View file

@ -0,0 +1,218 @@
/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86INTRIN_H
#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __FMA4INTRIN_H
#define __FMA4INTRIN_H
#include "pmmintrin.h"
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256)))
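/* Naming convention for the (otherwise undocumented) FMA4 intrinsics below,
 * applied element-wise:
 *   macc    = (A * B) + C          msub    = (A * B) - C
 *   nmacc   = -(A * B) + C         nmsub   = -(A * B) - C
 *   maddsub = subtract C in even elements, add C in odd elements
 *   msubadd = add C in even elements, subtract C in odd elements
 * The _ss/_sd forms operate on the low element only. */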
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif /* __FMA4INTRIN_H */
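A minimal sketch of the FMA4 multiply-add form, assuming a target built with
-mfma4; the function name is illustrative:

#include <x86intrin.h>

/* d[i] = a[i]*b[i] + c[i] for each of the four floats. */
__m128 madd4(__m128 a, __m128 b, __m128 c) {
  return _mm_macc_ps(a, b, c);
}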

796
third_party/intel/clang/fmaintrin.h vendored Normal file
View file

@ -0,0 +1,796 @@
/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __FMAINTRIN_H
#define __FMAINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
/// Computes a multiply-add of 128-bit vectors of [4 x float].
/// For each element, computes <c> (__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADD213PS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier.
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
/// Computes a multiply-add of 128-bit vectors of [2 x double].
/// For each element, computes <c> (__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADD213PD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier.
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit [2 x double] vector containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
/// Computes a scalar multiply-add of the single-precision values in the
/// low 32 bits of 128-bit vectors of [4 x float].
///
/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
/// result[127:32] = __A[127:32]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADD213SS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand in the low
/// 32 bits.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier in the low
/// 32 bits.
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend in the low
/// 32 bits.
/// \returns A 128-bit vector of [4 x float] containing the result in the low
/// 32 bits and a copy of \a __A[127:32] in the upper 96 bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
/// Computes a scalar multiply-add of the double-precision values in the
/// low 64 bits of 128-bit vectors of [2 x double].
///
/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
/// result[127:64] = __A[127:64]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADD213SD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand in the low
/// 64 bits.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier in the low
/// 64 bits.
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend in the low
/// 64 bits.
/// \returns A 128-bit vector of [2 x double] containing the result in the low
/// 64 bits and a copy of \a __A[127:64] in the upper 64 bits.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
/// Computes a multiply-subtract of 128-bit vectors of [4 x float].
/// For each element, computes <c> (__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier.
/// \param __C
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
/// Computes a multiply-subtract of 128-bit vectors of [2 x double].
/// For each element, computes <c> (__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier.
/// \param __C
///    A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
/// Computes a scalar multiply-subtract of the single-precision values in
/// the low 32 bits of 128-bit vectors of [4 x float].
///
/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
/// result[127:32] = __A[127:32]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUB213SS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand in the low
/// 32 bits.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier in the low
/// 32 bits.
/// \param __C
/// A 128-bit vector of [4 x float] containing the subtrahend in the low
/// 32 bits.
/// \returns A 128-bit vector of [4 x float] containing the result in the low
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
/// Computes a scalar multiply-subtract of the double-precision values in
/// the low 64 bits of 128-bit vectors of [2 x double].
///
/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
/// result[127:64] = __A[127:64]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUB213SD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand in the low
/// 64 bits.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier in the low
/// 64 bits.
/// \param __C
/// A 128-bit vector of [2 x double] containing the subtrahend in the low
/// 64 bits.
/// \returns A 128-bit vector of [2 x double] containing the result in the low
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
/// Computes a negated multiply-add of 128-bit vectors of [4 x float].
/// For each element, computes <c> -(__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMADD213PS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier.
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend.
/// \returns A 128-bit [4 x float] vector containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
/// Computes a negated multiply-add of 128-bit vectors of [2 x double].
/// For each element, computes <c> -(__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier.
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
/// Computes a scalar negated multiply-add of the single-precision values in
/// the low 32 bits of 128-bit vectors of [4 x float].
///
/// \code{.operation}
/// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
/// result[127:32] = __A[127:32]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMADD213SS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand in the low
/// 32 bits.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier in the low
/// 32 bits.
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend in the low
/// 32 bits.
/// \returns A 128-bit vector of [4 x float] containing the result in the low
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
}
/// Computes a scalar negated multiply-add of the double-precision values
/// in the low 64 bits of 128-bit vectors of [2 x double].
///
/// \code{.operation}
/// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
/// result[127:64] = __A[127:64]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMADD213SD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand in the low
/// 64 bits.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier in the low
/// 64 bits.
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend in the low
/// 64 bits.
/// \returns A 128-bit vector of [2 x double] containing the result in the low
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
}
/// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
/// For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier.
/// \param __C
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
/// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
/// For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier.
/// \param __C
/// A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
/// Computes a scalar negated multiply-subtract of the single-precision
/// values in the low 32 bits of 128-bit vectors of [4 x float].
///
/// \code{.operation}
/// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
/// result[127:32] = __A[127:32]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand in the low
/// 32 bits.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier in the low
/// 32 bits.
/// \param __C
/// A 128-bit vector of [4 x float] containing the subtrahend in the low
/// 32 bits.
/// \returns A 128-bit vector of [4 x float] containing the result in the low
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
}
/// Computes a scalar negated multiply-subtract of the double-precision
/// values in the low 64 bits of 128-bit vectors of [2 x double].
///
/// \code{.operation}
/// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
/// result[127:64] = __A[127:64]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand in the low
/// 64 bits.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier in the low
/// 64 bits.
/// \param __C
/// A 128-bit vector of [2 x double] containing the subtrahend in the low
/// 64 bits.
/// \returns A 128-bit vector of [2 x double] containing the result in the low
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
}
/// Computes a multiply with alternating add/subtract of 128-bit vectors of
/// [4 x float].
///
/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier.
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend/subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
/// Computes a multiply with alternating add/subtract of 128-bit vectors of
/// [2 x double].
///
/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier.
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend/subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
/// Computes a multiply with alternating add/subtract of 128-bit vectors of
/// [4 x float].
///
/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
///
/// \param __A
/// A 128-bit vector of [4 x float] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [4 x float] containing the multiplier.
/// \param __C
/// A 128-bit vector of [4 x float] containing the addend/subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
/// Computes a multiply with alternating add/subtract of 128-bit vectors of
/// [2 x double].
///
/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
///
/// \param __A
/// A 128-bit vector of [2 x double] containing the multiplicand.
/// \param __B
/// A 128-bit vector of [2 x double] containing the multiplier.
/// \param __C
/// A 128-bit vector of [2 x double] containing the addend/subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
/// Computes a multiply-add of 256-bit vectors of [8 x float].
/// For each element, computes <c> (__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADD213PS instruction.
///
/// \param __A
/// A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
/// A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
/// Computes a multiply-add of 256-bit vectors of [4 x double].
/// For each element, computes <c> (__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADD213PD instruction.
///
/// \param __A
/// A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
/// A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
/// Computes a multiply-subtract of 256-bit vectors of [8 x float].
/// For each element, computes <c> (__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
///
/// \param __A
/// A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
/// A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
/// Computes a multiply-subtract of 256-bit vectors of [4 x double].
/// For each element, computes <c> (__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
///
/// \param __A
/// A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
/// A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
/// Computes a negated multiply-add of 256-bit vectors of [8 x float].
/// For each element, computes <c> -(__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMADD213PS instruction.
///
/// \param __A
/// A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
/// A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
/// Computes a negated multiply-add of 256-bit vectors of [4 x double].
/// For each element, computes <c> -(__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
///
/// \param __A
/// A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
/// A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}
/// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
/// For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
///
/// \param __A
/// A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
/// A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
/// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
/// For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
///
/// \param __A
/// A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
/// A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
/// Computes a multiply with alternating add/subtract of 256-bit vectors of
/// [8 x float].
///
/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
/// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
/// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
/// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
/// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
///
/// \param __A
/// A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
/// A 256-bit vector of [8 x float] containing the addend/subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
/// Computes a multiply with alternating add/subtract of 256-bit vectors of
/// [4 x double].
///
/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
/// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
/// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
///
/// \param __A
/// A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
/// A 256-bit vector of [4 x double] containing the addend/subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
/// Computes a vector multiply with alternating add/subtract of 256-bit
/// vectors of [8 x float].
///
/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
/// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
/// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
/// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
/// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
///
/// \param __A
/// A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
/// A 256-bit vector of [8 x float] containing the addend/subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
/// Computes a vector multiply with alternating add/subtract of 256-bit
/// vectors of [4 x double].
///
/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
/// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
/// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
///
/// \param __A
/// A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
/// A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
/// A 256-bit vector of [4 x double] containing the addend/subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif /* __FMAINTRIN_H */
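A sketch of an 8-wide fused multiply-add loop (y += a*x) built on
_mm256_fmadd_ps; it assumes FMA and AVX are enabled (e.g. -mfma) and ignores
the tail elements for brevity:

#include <immintrin.h>
#include <stddef.h>

void saxpy8(float *y, const float *x, float a, size_t n) {
  __m256 va = _mm256_set1_ps(a);
  for (size_t i = 0; i + 8 <= n; i += 8) {
    __m256 vx = _mm256_loadu_ps(x + i);
    __m256 vy = _mm256_loadu_ps(y + i);
    _mm256_storeu_ps(y + i, _mm256_fmadd_ps(va, vx, vy));  /* y = a*x + y */
  }
}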

91
third_party/intel/clang/fxsrintrin.h vendored Normal file
View file

@ -0,0 +1,91 @@
/*===---- fxsrintrin.h - FXSR intrinsic ------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __FXSRINTRIN_H
#define __FXSRINTRIN_H
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fxsr")))
/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
/// memory region pointed to by the input parameter \a __p.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXSAVE </c> instruction.
///
/// \param __p
/// A pointer to a 512-byte memory region. The beginning of this memory
/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxsave(void *__p)
{
__builtin_ia32_fxsave(__p);
}
/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
/// memory region pointed to by the input parameter \a __p. The contents of
/// this memory region should have been written to by a previous \c _fxsave
/// or \c _fxsave64 intrinsic.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction.
///
/// \param __p
/// A pointer to a 512-byte memory region. The beginning of this memory
/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxrstor(void *__p)
{
__builtin_ia32_fxrstor(__p);
}
#ifdef __x86_64__
/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
/// memory region pointed to by the input parameter \a __p.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction.
///
/// \param __p
/// A pointer to a 512-byte memory region. The beginning of this memory
/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxsave64(void *__p)
{
__builtin_ia32_fxsave64(__p);
}
/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
/// memory region pointed to by the input parameter \a __p. The contents of
/// this memory region should have been written to by a previous \c _fxsave
/// or \c _fxsave64 intrinsic.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction.
///
/// \param __p
/// A pointer to a 512-byte memory region. The beginning of this memory
/// region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxrstor64(void *__p)
{
__builtin_ia32_fxrstor64(__p);
}
#endif
#undef __DEFAULT_FN_ATTRS
#endif
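A sketch of saving and restoring the legacy FP/SIMD state with the intrinsics
above (requires -mfxsr); the 512-byte, 16-byte-aligned save area follows the
FXSAVE layout described in the comments:

#include <immintrin.h>
#include <stdint.h>

void checkpoint_fpu_state(void) {
  _Alignas(16) uint8_t area[512];
  _fxsave(area);
  /* ... code that may clobber x87/MMX/XMM/MXCSR state ... */
  _fxrstor(area);
}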

211
third_party/intel/clang/gfniintrin.h vendored Normal file
View file

@ -0,0 +1,211 @@
/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __GFNIINTRIN_H
#define __GFNIINTRIN_H
/* Default attributes for simple form (no masking). */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("gfni,no-evex512"), __min_vector_width__(128)))
/* Default attributes for YMM unmasked form. */
#define __DEFAULT_FN_ATTRS_Y \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx,gfni,no-evex512"), \
__min_vector_width__(256)))
/* Default attributes for ZMM unmasked forms. */
#define __DEFAULT_FN_ATTRS_Z \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512f,evex512,gfni"), \
__min_vector_width__(512)))
/* Default attributes for ZMM masked forms. */
#define __DEFAULT_FN_ATTRS_Z_MASK \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512bw,evex512,gfni"), \
__min_vector_width__(512)))
/* Default attributes for VLX masked forms. */
#define __DEFAULT_FN_ATTRS_VL128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512bw,avx512vl,gfni,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_VL256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512bw,avx512vl,gfni,no-evex512"), \
__min_vector_width__(256)))
#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \
(char)(I)))
#define _mm_gf2p8affine_epi64_epi8(A, B, I) \
((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \
(char)(I)))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
(__v16qi) __B);
}
#ifdef __AVXINTRIN_H
#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \
((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \
(char)(I)))
#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \
(char)(I)))
static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
(__v32qi) __B);
}
#endif /* __AVXINTRIN_H */
#ifdef __AVX512BWINTRIN_H
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \
(char)(I)))
#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v64qi)(__m512i)(S)))
#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \
U, A, B, I)
#define _mm512_gf2p8affine_epi64_epi8(A, B, I) \
((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \
(char)(I)))
#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affine_epi64_epi8((A), (B), (I)), \
(__v64qi)(__m512i)(S)))
#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \
U, A, B, I)
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A,
(__v64qi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_selectb_512(__U,
(__v64qi) _mm512_gf2p8mul_epi8(__A, __B),
(__v64qi) __S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
{
return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
__U, __A, __B);
}
#endif /* __AVX512BWINTRIN_H */
#ifdef __AVX512VLBWINTRIN_H
#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S)))
#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
U, A, B, I)
#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S)))
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I)
#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S)))
#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), U, A, B, I)
#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S)))
#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I)
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_selectb_128(__U,
(__v16qi) _mm_gf2p8mul_epi8(__A, __B),
(__v16qi) __S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
{
return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
__U, __A, __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_selectb_256(__U,
(__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
(__v32qi) __S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
{
return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
__U, __A, __B);
}
#endif /* __AVX512VLBWINTRIN_H */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_Y
#undef __DEFAULT_FN_ATTRS_Z
#undef __DEFAULT_FN_ATTRS_VL128
#undef __DEFAULT_FN_ATTRS_VL256
#endif /* __GFNIINTRIN_H */
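A minimal sketch of the unmasked GF(2^8) byte multiply (requires -mgfni plus
SSE2); the reduction polynomial is the AES polynomial, which is fixed by the
instruction itself:

#include <immintrin.h>

__m128i gf2p8_mul(__m128i a, __m128i b) {
  return _mm_gf2p8mul_epi8(a, b);   /* byte-wise multiply in GF(2^8) */
}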

49
third_party/intel/clang/hresetintrin.h vendored Normal file
View file

@ -0,0 +1,49 @@
/*===---------------- hresetintrin.h - HRESET intrinsics -------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86GPRINTRIN_H
#error "Never use <hresetintrin.h> directly; include <x86gprintrin.h> instead."
#endif
#ifndef __HRESETINTRIN_H
#define __HRESETINTRIN_H
#if __has_extension(gnu_asm)
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("hreset")))
/// Provides a hint to the processor to selectively reset the prediction
/// history of the current logical processor specified by a 32-bit integer
/// value \a __eax.
///
/// This intrinsic corresponds to the <c> HRESET </c> instruction.
///
/// \code{.operation}
/// IF __eax == 0
/// // nop
/// ELSE
/// FOR i := 0 to 31
/// IF __eax[i]
/// ResetPredictionFeature(i)
/// FI
/// ENDFOR
/// FI
/// \endcode
static __inline void __DEFAULT_FN_ATTRS
_hreset(int __eax)
{
__asm__ ("hreset $0" :: "a"(__eax));
}
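/* Illustrative usage sketch (not part of the upstream header): requesting a
 * reset of the prediction-history feature in bit 0, assuming that bit is
 * enumerated as supported by CPUID leaf 0x20 and the code is built with
 * HRESET support (e.g. -mhreset).
 *
 *   _hreset(1u << 0);
 */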
#undef __DEFAULT_FN_ATTRS
#endif /* __has_extension(gnu_asm) */
#endif /* __HRESETINTRIN_H */

863
third_party/intel/clang/ia32intrin.h vendored Normal file
View file

@ -0,0 +1,863 @@
/* ===-------- ia32intrin.h ---------------------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86INTRIN_H
#error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __IA32INTRIN_H
#define __IA32INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS_CRC32 __attribute__((__always_inline__, __nodebug__, __target__("crc32")))
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) constexpr
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#else
#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__))
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#endif
/// Finds the first set bit starting from the least significant bit. The result
/// is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BSF instruction or the
/// \c TZCNT instruction.
///
/// \param __A
/// A 32-bit integer operand.
/// \returns A 32-bit integer containing the bit number.
/// \see _bit_scan_forward
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsfd(int __A) {
return __builtin_ctz((unsigned int)__A);
}
/// Finds the first set bit starting from the most significant bit. The result
/// is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BSR instruction or the
/// \c LZCNT instruction and an \c XOR.
///
/// \param __A
/// A 32-bit integer operand.
/// \returns A 32-bit integer containing the bit number.
/// \see _bit_scan_reverse
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsrd(int __A) {
return 31 - __builtin_clz((unsigned int)__A);
}
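/* Illustrative usage sketch (not part of the upstream header): for the input
 * 0x90 (binary 10010000) the lowest set bit is bit 4 and the highest is bit 7.
 * Both intrinsics are undefined for a zero input.
 *
 *   int lo = __bsfd(0x90);   // 4
 *   int hi = __bsrd(0x90);   // 7
 */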
/// Swaps the bytes in the input, converting little endian to big endian or
/// vice versa.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BSWAP instruction.
///
/// \param __A
/// A 32-bit integer operand.
/// \returns A 32-bit integer containing the swapped bytes.
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bswapd(int __A) {
return (int)__builtin_bswap32((unsigned int)__A);
}
/// Swaps the bytes in the input, converting little endian to big endian or
/// vice versa.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BSWAP instruction.
///
/// \param __A
/// A 32-bit integer operand.
/// \returns A 32-bit integer containing the swapped bytes.
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
_bswap(int __A) {
return (int)__builtin_bswap32((unsigned int)__A);
}
/// Finds the first set bit starting from the least significant bit. The result
/// is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _bit_scan_forward(int A);
/// \endcode
///
/// This intrinsic corresponds to the \c BSF instruction or the
/// \c TZCNT instruction.
///
/// \param A
/// A 32-bit integer operand.
/// \returns A 32-bit integer containing the bit number.
/// \see __bsfd
#define _bit_scan_forward(A) __bsfd((A))
/// Finds the first set bit starting from the most significant bit. The result
/// is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _bit_scan_reverse(int A);
/// \endcode
///
/// This intrinsic corresponds to the \c BSR instruction or the
/// \c LZCNT instruction and an \c XOR.
///
/// \param A
/// A 32-bit integer operand.
/// \returns A 32-bit integer containing the bit number.
/// \see __bsrd
#define _bit_scan_reverse(A) __bsrd((A))
#ifdef __x86_64__
/// Finds the first set bit starting from the least significant bit. The result
/// is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BSF instruction or the
/// \c TZCNT instruction.
///
/// \param __A
/// A 64-bit integer operand.
/// \returns A 32-bit integer containing the bit number.
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsfq(long long __A) {
return (long long)__builtin_ctzll((unsigned long long)__A);
}
/// Finds the first set bit starting from the most significant bit. The result
/// is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BSR instruction or the
/// \c LZCNT instruction and an \c XOR.
///
/// \param __A
/// A 64-bit integer operand.
/// \returns A 32-bit integer containing the bit number.
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsrq(long long __A) {
return 63 - __builtin_clzll((unsigned long long)__A);
}
/// Swaps the bytes in the input, converting little endian to big endian or
/// vice versa.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BSWAP instruction.
///
/// \param __A
/// A 64-bit integer operand.
/// \returns A 64-bit integer containing the swapped bytes.
/// \see _bswap64
static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
__bswapq(long long __A) {
return (long long)__builtin_bswap64((unsigned long long)__A);
}
/// Swaps the bytes in the input, converting little endian to big endian or
/// vice versa.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _bswap64(long long A);
/// \endcode
///
/// This intrinsic corresponds to the \c BSWAP instruction.
///
/// \param A
/// A 64-bit integer operand.
/// \returns A 64-bit integer containing the swapped bytes.
/// \see __bswapq
#define _bswap64(A) __bswapq((A))
#endif /* __x86_64__ */
/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c POPCNT instruction or a
/// sequence of arithmetic and logic operations to calculate it.
///
/// \param __A
/// An unsigned 32-bit integer operand.
/// \returns A 32-bit integer containing the number of bits with value 1 in the
/// source operand.
/// \see _popcnt32
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__popcntd(unsigned int __A)
{
return __builtin_popcount(__A);
}
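/* Illustrative usage sketch (not part of the upstream header): 0xF0F0 has
 * eight bits set.
 *
 *   int n = __popcntd(0xF0F0u);   // 8
 */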
/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _popcnt32(int A);
/// \endcode
///
/// This intrinsic corresponds to the \c POPCNT instruction or a
/// sequence of arithmetic and logic operations to calculate it.
///
/// \param A
/// An unsigned 32-bit integer operand.
/// \returns A 32-bit integer containing the number of bits with value 1 in the
/// source operand.
/// \see __popcntd
#define _popcnt32(A) __popcntd((A))
#ifdef __x86_64__
/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c POPCNT instruction or a
/// sequence of arithmetic and logic operations to calculate it.
///
/// \param __A
/// An unsigned 64-bit integer operand.
/// \returns A 64-bit integer containing the number of bits with value 1 in the
/// source operand.
/// \see _popcnt64
static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
__popcntq(unsigned long long __A)
{
return __builtin_popcountll(__A);
}
/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _popcnt64(unsigned long long A);
/// \endcode
///
/// This intrinsic corresponds to the \c POPCNT instruction or a
/// sequence of arithmetic and logic operations to calculate it.
///
/// \param A
/// An unsigned 64-bit integer operand.
/// \returns A 64-bit integer containing the number of bits with value 1 in the
/// source operand.
/// \see __popcntq
#define _popcnt64(A) __popcntq((A))
#endif /* __x86_64__ */
#ifdef __x86_64__
/// Returns the program status-and-control \c RFLAGS register with the \c VM
/// and \c RF flags cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PUSHFQ + \c POP instruction sequence.
///
/// \returns The 64-bit value of the RFLAGS register.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__readeflags(void)
{
return __builtin_ia32_readeflags_u64();
}
/// Writes the specified value to the program status-and-control \c RFLAGS
/// register. Reserved bits are not affected.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PUSH + \c POPFQ instruction sequence.
///
/// \param __f
/// The 64-bit value to write to \c RFLAGS.
static __inline__ void __DEFAULT_FN_ATTRS
__writeeflags(unsigned long long __f)
{
__builtin_ia32_writeeflags_u64(__f);
}
#else /* !__x86_64__ */
/// Returns the program status-and-control \c EFLAGS register with the \c VM
/// and \c RF flags cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PUSHFD + \c POP instruction sequence.
///
/// \returns The 32-bit value of the EFLAGS register.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__readeflags(void)
{
return __builtin_ia32_readeflags_u32();
}
/// Writes the specified value to the program status-and-control \c EFLAGS
/// register. Reserved bits are not affected.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PUSH + \c POPFD instruction sequence.
///
/// \param __f
/// The 32-bit value to write to \c EFLAGS.
static __inline__ void __DEFAULT_FN_ATTRS
__writeeflags(unsigned int __f)
{
__builtin_ia32_writeeflags_u32(__f);
}
#endif /* !__x86_64__ */
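/* Illustrative usage sketch (not part of the upstream header): testing the
 * interrupt-enable flag (IF, bit 9) of the flags register; the value read is
 * 64 bits wide under __x86_64__ and 32 bits wide otherwise.
 *
 *   int interrupts_enabled = (int)((__readeflags() >> 9) & 1);
 */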
/// Casts a 32-bit float value to a 32-bit unsigned integer value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVD / \c MOVD instruction in x86_64,
/// and corresponds to the \c VMOVL / \c MOVL instruction in ia32.
///
/// \param __A
/// A 32-bit float value.
/// \returns A 32-bit unsigned integer containing the converted value.
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CAST
_castf32_u32(float __A) {
return __builtin_bit_cast(unsigned int, __A);
}
/// Casts a 64-bit float value to a 64-bit unsigned integer value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVQ / \c MOVQ instruction in x86_64,
/// and corresponds to the \c VMOVL / \c MOVL instruction in ia32.
///
/// \param __A
/// A 64-bit float value.
/// \returns A 64-bit unsigned integer containing the converted value.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CAST
_castf64_u64(double __A) {
return __builtin_bit_cast(unsigned long long, __A);
}
/// Casts a 32-bit unsigned integer value to a 32-bit float value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVQ / \c MOVQ instruction in x86_64,
/// and corresponds to the \c FLDS instruction in ia32.
///
/// \param __A
/// A 32-bit unsigned integer value.
/// \returns A 32-bit float value containing the converted value.
static __inline__ float __DEFAULT_FN_ATTRS_CAST
_castu32_f32(unsigned int __A) {
return __builtin_bit_cast(float, __A);
}
/// Casts a 64-bit unsigned integer value to a 64-bit float value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVQ / \c MOVQ instruction in x86_64,
/// and corresponds to the \c FLDL instruction in ia32.
///
/// \param __A
/// A 64-bit unsigned integer value.
/// \returns A 64-bit float value containing the converted value.
static __inline__ double __DEFAULT_FN_ATTRS_CAST
_castu64_f64(unsigned long long __A) {
return __builtin_bit_cast(double, __A);
}
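/* Illustrative usage sketch (not part of the upstream header): 1.0f has the
 * IEEE-754 bit pattern 0x3F800000, so the cast intrinsics round-trip it
 * without changing any bits.
 *
 *   unsigned int bits = _castf32_u32(1.0f);   // 0x3F800000
 *   float back = _castu32_f32(bits);          // 1.0f
 */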
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned char operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c CRC32B instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
__crc32b(unsigned int __C, unsigned char __D)
{
return __builtin_ia32_crc32qi(__C, __D);
}
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned short operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c CRC32W instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
__crc32w(unsigned int __C, unsigned short __D)
{
return __builtin_ia32_crc32hi(__C, __D);
}
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// second unsigned integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c CRC32D instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
__crc32d(unsigned int __C, unsigned int __D)
{
return __builtin_ia32_crc32si(__C, __D);
}
#ifdef __x86_64__
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned 64-bit integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c CRC32Q instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CRC32
__crc32q(unsigned long long __C, unsigned long long __D)
{
return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */
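/* Illustrative usage sketch (not part of the upstream header): accumulating a
 * CRC-32C (Castagnoli) checksum over a byte buffer; buf and len are
 * hypothetical names.
 *
 *   unsigned int crc = 0xFFFFFFFFu;
 *   for (size_t i = 0; i < len; ++i)
 *     crc = __crc32b(crc, buf[i]);
 *   crc = ~crc;   // final CRC-32C value
 */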
/// Reads the specified performance-monitoring counter. Refer to your
/// processor's documentation to determine which performance counters are
/// supported.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c RDPMC instruction.
///
/// \param __A
/// The performance counter to read.
/// \returns The 64-bit value read from the performance counter.
/// \see _rdpmc
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__rdpmc(int __A) {
return __builtin_ia32_rdpmc(__A);
}
/// Reads the processor's time-stamp counter and the \c IA32_TSC_AUX MSR
/// \c (0xc0000103).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c RDTSCP instruction.
///
/// \param __A
/// The address of where to store the 32-bit \c IA32_TSC_AUX value.
/// \returns The 64-bit value of the time-stamp counter.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__rdtscp(unsigned int *__A) {
return __builtin_ia32_rdtscp(__A);
}
/// Reads the processor's time-stamp counter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _rdtsc();
/// \endcode
///
/// This intrinsic corresponds to the \c RDTSC instruction.
///
/// \returns The 64-bit value of the time-stamp counter.
#define _rdtsc() __rdtsc()
/// Reads the specified performance monitoring counter. Refer to your
/// processor's documentation to determine which performance counters are
/// supported.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _rdpmc(int A);
/// \endcode
///
/// This intrinsic corresponds to the \c RDPMC instruction.
///
/// \param A
/// The performance counter to read.
/// \returns The 64-bit value read from the performance counter.
/// \see __rdpmc
#define _rdpmc(A) __rdpmc(A)
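/* Illustrative usage sketch (not part of the upstream header): bracketing a
 * region with the time-stamp counter; work() is a hypothetical function, and
 * the two readings are only meaningfully comparable on a core with an
 * invariant TSC.
 *
 *   unsigned int aux;
 *   unsigned long long t0 = __rdtscp(&aux);
 *   work();
 *   unsigned long long cycles = __rdtscp(&aux) - t0;
 */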
static __inline__ void __DEFAULT_FN_ATTRS
_wbinvd(void) {
__builtin_ia32_wbinvd();
}
/// Rotates an 8-bit value to the left by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ROL instruction.
///
/// \param __X
/// The unsigned 8-bit value to be rotated.
/// \param __C
/// The number of bits to rotate the value.
/// \returns The rotated value.
static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
__rolb(unsigned char __X, int __C) {
return __builtin_rotateleft8(__X, __C);
}
/// Rotates an 8-bit value to the right by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ROR instruction.
///
/// \param __X
/// The unsigned 8-bit value to be rotated.
/// \param __C
/// The number of bits to rotate the value.
/// \returns The rotated value.
static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
__rorb(unsigned char __X, int __C) {
return __builtin_rotateright8(__X, __C);
}
/// Rotates a 16-bit value to the left by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ROL instruction.
///
/// \param __X
/// The unsigned 16-bit value to be rotated.
/// \param __C
/// The number of bits to rotate the value.
/// \returns The rotated value.
/// \see _rotwl
static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
__rolw(unsigned short __X, int __C) {
return __builtin_rotateleft16(__X, __C);
}
/// Rotates a 16-bit value to the right by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ROR instruction.
///
/// \param __X
/// The unsigned 16-bit value to be rotated.
/// \param __C
/// The number of bits to rotate the value.
/// \returns The rotated value.
/// \see _rotwr
static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
__rorw(unsigned short __X, int __C) {
return __builtin_rotateright16(__X, __C);
}
/// Rotates a 32-bit value to the left by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ROL instruction.
///
/// \param __X
/// The unsigned 32-bit value to be rotated.
/// \param __C
/// The number of bits to rotate the value.
/// \returns The rotated value.
/// \see _rotl
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
__rold(unsigned int __X, int __C) {
return __builtin_rotateleft32(__X, (unsigned int)__C);
}
/// Rotates a 32-bit value to the right by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ROR instruction.
///
/// \param __X
/// The unsigned 32-bit value to be rotated.
/// \param __C
/// The number of bits to rotate the value.
/// \returns The rotated value.
/// \see _rotr
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
__rord(unsigned int __X, int __C) {
return __builtin_rotateright32(__X, (unsigned int)__C);
}
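/* Illustrative usage sketch (not part of the upstream header): rotating
 * 0x80000001 left by one bit carries the top bit around into bit 0.
 *
 *   unsigned int r = __rold(0x80000001u, 1);   // 0x00000003
 */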
#ifdef __x86_64__
/// Rotates a 64-bit value to the left by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ROL instruction.
///
/// \param __X
/// The unsigned 64-bit value to be rotated.
/// \param __C
/// The number of bits to rotate the value.
/// \returns The rotated value.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
__rolq(unsigned long long __X, int __C) {
return __builtin_rotateleft64(__X, (unsigned long long)__C);
}
/// Rotates a 64-bit value to the right by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ROR instruction.
///
/// \param __X
/// The unsigned 64-bit value to be rotated.
/// \param __C
/// The number of bits to rotate the value.
/// \returns The rotated value.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
__rorq(unsigned long long __X, int __C) {
return __builtin_rotateright64(__X, (unsigned long long)__C);
}
#endif /* __x86_64__ */
#ifndef _MSC_VER
/* These are already provided as builtins for MSVC. */
/* Select the correct function based on the size of long. */
#ifdef __LP64__
/// Rotates a 64-bit value to the left by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _lrotl(unsigned long long a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROL instruction.
///
/// \param a
/// The unsigned 64-bit value to be rotated.
/// \param b
/// The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rolq
#define _lrotl(a,b) __rolq((a), (b))
/// Rotates a 64-bit value to the right by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _lrotr(unsigned long long a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROR instruction.
///
/// \param a
/// The unsigned 64-bit value to be rotated.
/// \param b
/// The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rorq
#define _lrotr(a,b) __rorq((a), (b))
#else // __LP64__
/// Rotates a 32-bit value to the left by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _lrotl(unsigned int a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROL instruction.
///
/// \param a
/// The unsigned 32-bit value to be rotated.
/// \param b
/// The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rold
#define _lrotl(a,b) __rold((a), (b))
/// Rotates a 32-bit value to the right by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _lrotr(unsigned int a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROR instruction.
///
/// \param a
/// The unsigned 32-bit value to be rotated.
/// \param b
/// The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rord
#define _lrotr(a,b) __rord((a), (b))
#endif // __LP64__
/// Rotates a 32-bit value to the left by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _rotl(unsigned int a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROL instruction.
///
/// \param a
/// The unsigned 32-bit value to be rotated.
/// \param b
/// The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rold
#define _rotl(a,b) __rold((a), (b))
/// Rotates a 32-bit value to the right by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _rotr(unsigned int a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROR instruction.
///
/// \param a
/// The unsigned 32-bit value to be rotated.
/// \param b
/// The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rord
#define _rotr(a,b) __rord((a), (b))
#endif // _MSC_VER
/* These are not builtins so need to be provided in all modes. */
/// Rotates a 16-bit value to the left by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned short _rotwl(unsigned short a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROL instruction.
///
/// \param a
/// The unsigned 16-bit value to be rotated.
/// \param b
/// The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rolw
#define _rotwl(a,b) __rolw((a), (b))
/// Rotates a 16-bit value to the right by the specified number of bits.
/// This operation is undefined if the number of bits exceeds the size of
/// the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned short _rotwr(unsigned short a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROR instruction.
///
/// \param a
/// The unsigned 16-bit value to be rotated.
/// \param b
/// The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rorw
#define _rotwr(a,b) __rorw((a), (b))
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CAST
#undef __DEFAULT_FN_ATTRS_CRC32
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#endif /* __IA32INTRIN_H */

747
third_party/intel/clang/immintrin.h vendored Normal file
View file

@ -0,0 +1,747 @@
/*===---- immintrin.h - Intel intrinsics -----------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#define __IMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include "x86gprintrin.h"
#if !defined(__SCE__) || __has_feature(modules) || defined(__MMX__)
#include "mmintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE__)
#include "xmmintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE2__)
#include "emmintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE3__)
#include "pmmintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__SSSE3__)
#include "tmmintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__SSE4_2__) || defined(__SSE4_1__))
#include "smmintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AES__) || defined(__PCLMUL__))
#include "wmmintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__CLFLUSHOPT__)
#include "clflushoptintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__CLWB__)
#include "clwbintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX__)
#include "avxintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX2__)
#include "avx2intrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__F16C__)
#include "f16cintrin.h"
#endif
/* No feature check desired due to internal checks */
#include "bmiintrin.h"
#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI2__)
#include "bmi2intrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__LZCNT__)
#include "lzcntintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__POPCNT__)
#include "popcntintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__FMA__)
#include "fmaintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512F__)
#include "avx512fintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VL__)
#include "avx512vlintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BW__)
#include "avx512bwintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BITALG__)
#include "avx512bitalgintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512CD__)
#include "avx512cdintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
#include "avx512vpopcntdqintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
#include "avx512vpopcntdqvlintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VNNI__)
#include "avx512vnniintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512VNNI__))
#include "avx512vlvnniintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNI__)
#include "avxvnniintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512DQ__)
#include "avx512dqintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512BITALG__))
#include "avx512vlbitalgintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512BW__))
#include "avx512vlbwintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512CD__))
#include "avx512vlcdintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512DQ__))
#include "avx512vldqintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512IFMA__)
#include "avx512ifmaintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX512IFMA__) && defined(__AVX512VL__))
#include "avx512ifmavlintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXIFMA__)
#include "avxifmaintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI__)
#include "avx512vbmiintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX512VBMI__) && defined(__AVX512VL__))
#include "avx512vbmivlintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI2__)
#include "avx512vbmi2intrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX512VBMI2__) && defined(__AVX512VL__))
#include "avx512vlvbmi2intrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512FP16__)
#include "avx512fp16intrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512FP16__))
#include "avx512vlfp16intrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BF16__)
#include "avx512bf16intrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512BF16__))
#include "avx512vlbf16intrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__PKU__)
#include "pkuintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__VPCLMULQDQ__)
#include "vpclmulqdqintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__VAES__)
#include "vaesintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__GFNI__)
#include "gfniintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT8__)
#include "avxvnniint8intrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXNECONVERT__)
#include "avxneconvertintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA512__)
#include "sha512intrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__SM3__)
#include "sm3intrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__SM4__)
#include "sm4intrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT16__)
#include "avxvnniint16intrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__RDPID__)
/// Reads the value of the IA32_TSC_AUX MSR (0xc0000103).
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDPID </c> instruction.
///
/// \returns The 32-bit contents of the MSR.
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("rdpid")))
_rdpid_u32(void) {
return __builtin_ia32_rdpid();
}
#endif // __RDPID__
#if !defined(__SCE__) || __has_feature(modules) || defined(__RDRND__)
/// Returns a 16-bit hardware-generated random value.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
///
/// \param __p
/// A pointer to a 16-bit memory location to place the random value.
/// \returns 1 if the value was successfully generated, 0 otherwise.
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand16_step(unsigned short *__p)
{
return (int)__builtin_ia32_rdrand16_step(__p);
}
/// Returns a 32-bit hardware-generated random value.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
///
/// \param __p
/// A pointer to a 32-bit memory location to place the random value.
/// \returns 1 if the value was successfully generated, 0 otherwise.
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand32_step(unsigned int *__p)
{
return (int)__builtin_ia32_rdrand32_step(__p);
}
/// Returns a 64-bit hardware-generated random value.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
///
/// \param __p
/// A pointer to a 64-bit memory location to place the random value.
/// \returns 1 if the value was successfully generated, 0 otherwise.
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand64_step(unsigned long long *__p)
{
#ifdef __x86_64__
return (int)__builtin_ia32_rdrand64_step(__p);
#else
// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
// rdrand instructions.
unsigned int __lo, __hi;
unsigned int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
unsigned int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
if (__res_lo && __res_hi) {
*__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo;
return 1;
} else {
*__p = 0;
return 0;
}
#endif
}
#endif /* __RDRND__ */
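/* Illustrative usage sketch (not part of the upstream header): RDRAND can
 * transiently fail (return 0), so callers conventionally retry a bounded
 * number of times.
 *
 *   unsigned int value;
 *   int ok = 0;
 *   for (int i = 0; i < 10 && !ok; ++i)
 *     ok = _rdrand32_step(&value);
 */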
#if !defined(__SCE__) || __has_feature(modules) || defined(__FSGSBASE__)
#ifdef __x86_64__
/// Reads the FS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDFSBASE </c> instruction.
///
/// \returns The lower 32 bits of the FS base register.
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readfsbase_u32(void)
{
return __builtin_ia32_rdfsbase32();
}
/// Reads the FS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDFSBASE </c> instruction.
///
/// \returns The contents of the FS base register.
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readfsbase_u64(void)
{
return __builtin_ia32_rdfsbase64();
}
/// Reads the GS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDGSBASE </c> instruction.
///
/// \returns The lower 32 bits of the GS base register.
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readgsbase_u32(void)
{
return __builtin_ia32_rdgsbase32();
}
/// Reads the GS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDGSBASE </c> instruction.
///
/// \returns The contents of the GS base register.
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readgsbase_u64(void)
{
return __builtin_ia32_rdgsbase64();
}
/// Modifies the FS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> WRFSBASE </c> instruction.
///
/// \param __V
/// Value to use for the lower 32 bits of the FS base register.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writefsbase_u32(unsigned int __V)
{
__builtin_ia32_wrfsbase32(__V);
}
/// Modifies the FS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> WRFSBASE </c> instruction.
///
/// \param __V
/// Value to use for the FS base register.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writefsbase_u64(unsigned long long __V)
{
__builtin_ia32_wrfsbase64(__V);
}
/// Modifies the GS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> WRGSBASE </c> instruction.
///
/// \param __V
/// Value to use for the lower 32 bits of the GS base register.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writegsbase_u32(unsigned int __V)
{
__builtin_ia32_wrgsbase32(__V);
}
/// Modifies the GS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> WRGSBASE </c> instruction.
///
/// \param __V
/// Value to use for the GS base register.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writegsbase_u64(unsigned long long __V)
{
__builtin_ia32_wrgsbase64(__V);
}
#endif
#endif /* __FSGSBASE__ */
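/* Illustrative usage sketch (not part of the upstream header): reading the FS
 * base on x86-64. This only succeeds where the OS has set CR4.FSGSBASE;
 * otherwise the underlying instruction faults.
 *
 *   unsigned long long fsbase = _readfsbase_u64();
 */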
#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVBE__)
/* The structs used below are to force the load/store to be unaligned. This
* is accomplished with the __packed__ attribute. The __may_alias__ prevents
* tbaa metadata from being generated based on the struct and the type of the
* field inside of it.
*/
/// Load a 16-bit value from memory and swap its bytes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
/// A pointer to the 16-bit value to load.
/// \returns The byte-swapped value.
static __inline__ short __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i16(void const * __P) {
struct __loadu_i16 {
unsigned short __v;
} __attribute__((__packed__, __may_alias__));
return (short)__builtin_bswap16(((const struct __loadu_i16*)__P)->__v);
}
/// Swap the bytes of a 16-bit value and store it to memory.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
/// A pointer to the memory for storing the swapped value.
/// \param __D
/// The 16-bit value to be byte-swapped.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i16(void * __P, short __D) {
struct __storeu_i16 {
unsigned short __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_i16*)__P)->__v = __builtin_bswap16((unsigned short)__D);
}
/// Load a 32-bit value from memory and swap its bytes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
/// A pointer to the 32-bit value to load.
/// \returns The byte-swapped value.
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i32(void const * __P) {
struct __loadu_i32 {
unsigned int __v;
} __attribute__((__packed__, __may_alias__));
return (int)__builtin_bswap32(((const struct __loadu_i32*)__P)->__v);
}
/// Swap the bytes of a 32-bit value and store it to memory.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
/// A pointer to the memory for storing the swapped value.
/// \param __D
/// The 32-bit value to be byte-swapped.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i32(void * __P, int __D) {
struct __storeu_i32 {
unsigned int __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_i32*)__P)->__v = __builtin_bswap32((unsigned int)__D);
}
#ifdef __x86_64__
/// Load a 64-bit value from memory and swap its bytes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
/// A pointer to the 64-bit value to load.
/// \returns The byte-swapped value.
static __inline__ long long __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i64(void const * __P) {
struct __loadu_i64 {
unsigned long long __v;
} __attribute__((__packed__, __may_alias__));
return (long long)__builtin_bswap64(((const struct __loadu_i64*)__P)->__v);
}
/// Swap the bytes of a 64-bit value and store it to memory.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
/// A pointer to the memory for storing the swapped value.
/// \param __D
/// The 64-bit value to be byte-swapped.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i64(void * __P, long long __D) {
struct __storeu_i64 {
unsigned long long __v;
} __attribute__((__packed__, __may_alias__));
((struct __storeu_i64*)__P)->__v = __builtin_bswap64((unsigned long long)__D);
}
#endif
#endif /* __MOVBE */
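/* Illustrative usage sketch (not part of the upstream header): reading a
 * big-endian 32-bit length field from a network buffer on a little-endian
 * host; pkt is a hypothetical byte pointer and the load may be unaligned.
 *
 *   int msg_len = _loadbe_i32(pkt);
 */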
#if !defined(__SCE__) || __has_feature(modules) || defined(__RTM__)
#include "rtmintrin.h"
#include "xtestintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA__)
#include "shaintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__FXSR__)
#include "fxsrintrin.h"
#endif
/* No feature check desired due to internal MSC_VER checks */
#include "xsaveintrin.h"
#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEOPT__)
#include "xsaveoptintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEC__)
#include "xsavecintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVES__)
#include "xsavesintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__SHSTK__)
#include "cetintrin.h"
#endif
/* Intrinsics inside adcintrin.h are available at all times. */
#include "adcintrin.h"
#if !defined(__SCE__) || __has_feature(modules) || defined(__ADX__)
#include "adxintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__RDSEED__)
#include "rdseedintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__WBNOINVD__)
#include "wbnoinvdintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__CLDEMOTE__)
#include "cldemoteintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__WAITPKG__)
#include "waitpkgintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVDIRI__) || \
defined(__MOVDIR64B__)
#include "movdirintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__PCONFIG__)
#include "pconfigintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__SGX__)
#include "sgxintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__PTWRITE__)
#include "ptwriteintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__INVPCID__)
#include "invpcidintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__)
#include "amxfp16intrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__) || \
defined(__WIDEKL__)
#include "keylockerintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TILE__) || \
defined(__AMX_INT8__) || defined(__AMX_BF16__)
#include "amxintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_COMPLEX__)
#include "amxcomplexintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || \
defined(__AVX512VP2INTERSECT__)
#include "avx512vp2intersectintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || \
(defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
#include "avx512vlvp2intersectintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
#include "enqcmdintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__SERIALIZE__)
#include "serializeintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__TSXLDTRK__)
#include "tsxldtrkintrin.h"
#endif
#if defined(_MSC_VER) && __has_extension(gnu_asm)
/* Define the default attributes for these intrinsics */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#ifdef __cplusplus
extern "C" {
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Exchange HLE
\*----------------------------------------------------------------------------*/
#if defined(__i386__) || defined(__x86_64__)
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) {
__asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
__asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
#endif
#if defined(__x86_64__)
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) {
__asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
__asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Compare Exchange HLE
\*----------------------------------------------------------------------------*/
#if defined(__i386__) || defined(__x86_64__)
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
long _Exchange, long _Comparand) {
__asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
}
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedCompareExchange_HLERelease(long volatile *_Destination,
long _Exchange, long _Comparand) {
__asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
}
#endif
#if defined(__x86_64__)
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
__int64 _Exchange, __int64 _Comparand) {
__asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
}
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination,
__int64 _Exchange, __int64 _Comparand) {
__asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
}
#endif
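/* Illustrative usage sketch (not part of the upstream header): a test-and-set
 * spinlock whose critical section is elided with HLE hints; lock is a
 * hypothetical `long volatile` initialized to 0. These wrappers are only
 * compiled for MSVC-style environments per the guard above.
 *
 *   while (_InterlockedExchange_HLEAcquire(&lock, 1) != 0)
 *     ;                                       // spin until acquired
 *   // ... critical section ...
 *   _InterlockedExchange_HLERelease(&lock, 0);
 */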
#ifdef __cplusplus
}
#endif
#undef __DEFAULT_FN_ATTRS
#endif /* defined(_MSC_VER) && __has_extension(gnu_asm) */
#endif /* __IMMINTRIN_H */

23
third_party/intel/clang/invpcidintrin.h vendored Normal file
View file

@ -0,0 +1,23 @@
/*===------------- invpcidintrin.h - INVPCID intrinsic ---------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <invpcidintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __INVPCIDINTRIN_H
#define __INVPCIDINTRIN_H
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("invpcid")))
_invpcid(unsigned int __type, void *__descriptor) {
__builtin_ia32_invpcid(__type, __descriptor);
}
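/* Illustrative usage sketch (not part of the upstream header): INVPCID is a
 * privileged (ring 0) instruction. Type 2 invalidates translations for all
 * PCIDs, including global translations; a zeroed 16-byte descriptor (PCID in
 * bits 11:0, linear address in the high quadword) keeps the reserved bits
 * clear.
 *
 *   struct { unsigned long long pcid, addr; } desc = {0, 0};
 *   _invpcid(2, &desc);
 */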
#endif /* __INVPCIDINTRIN_H */

527
third_party/intel/clang/keylockerintrin.h vendored Normal file
View file

@ -0,0 +1,527 @@
/*===----------------- keylockerintrin.h - KL Intrinsics -------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <keylockerintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _KEYLOCKERINTRIN_H
#define _KEYLOCKERINTRIN_H
#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__)
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("kl"),\
__min_vector_width__(128)))
/// Load internal wrapping key from __intkey, __enkey_lo and __enkey_hi. __ctl
/// will be assigned to EAX, which specifies the KeySource and whether backing up
/// the key is permitted. The 256-bit encryption key is loaded from the two
/// explicit operands (__enkey_lo and __enkey_hi). The 128-bit integrity key is
/// loaded from the implicit operand XMM0, which is assigned by __intkey.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LOADIWKEY </c> instructions.
///
/// \code{.operation}
/// IF CPL > 0 // LOADIWKEY only allowed at ring 0 (supervisor mode)
/// GP (0)
/// FI
/// IF “LOADIWKEY exiting” VM execution control set
/// VMexit
/// FI
/// IF __ctl[4:1] > 1 // Reserved KeySource encoding used
/// GP (0)
/// FI
/// IF __ctl[31:5] != 0 // Reserved bit in __ctl is set
/// GP (0)
/// FI
/// IF __ctl[0] AND (CPUID.19H.ECX[0] == 0) // NoBackup is not supported on this part
/// GP (0)
/// FI
/// IF (__ctl[4:1] == 1) AND (CPUID.19H.ECX[1] == 0) // KeySource of 1 is not supported on this part
/// GP (0)
/// FI
/// IF (__ctl[4:1] == 0) // KeySource of 0.
/// IWKey.Encryption Key[127:0] := __enkey_hi[127:0]:
/// IWKey.Encryption Key[255:128] := __enkey_lo[127:0]
/// IWKey.IntegrityKey[127:0] := __intkey[127:0]
/// IWKey.NoBackup := __ctl[0]
/// IWKey.KeySource := __ctl[4:1]
/// ZF := 0
/// ELSE // KeySource of 1. See RDSEED definition for details of randomness
/// IF HW_NRND_GEN.ready == 1 // Full-entropy random data from RDSEED was received
/// IWKey.Encryption Key[127:0] := __enkey_hi[127:0] XOR HW_NRND_GEN.data[127:0]
/// IWKey.Encryption Key[255:128] := __enkey_lo[127:0] XOR HW_NRND_GEN.data[255:128]
/// IWKey.Encryption Key[255:0] := __enkey_hi[127:0]:__enkey_lo[127:0] XOR HW_NRND_GEN.data[255:0]
/// IWKey.IntegrityKey[127:0] := __intkey[127:0] XOR HW_NRND_GEN.data[383:256]
/// IWKey.NoBackup := __ctl[0]
/// IWKey.KeySource := __ctl[4:1]
/// ZF := 0
/// ELSE // Random data was not returned from RDSEED. IWKey was not loaded
/// ZF := 1
/// FI
/// FI
/// dst := ZF
/// OF := 0
/// SF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS
_mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
__m128i __enkey_lo, __m128i __enkey_hi) {
__builtin_ia32_loadiwkey (__intkey, __enkey_lo, __enkey_hi, __ctl);
}
/// Wrap a 128-bit AES key from __key into a key handle, store the handle in
/// ((__m128i*)__h) to ((__m128i*)__h) + 2, and return a 32-bit value.
/// The explicit source operand __htype specifies handle restrictions.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ENCODEKEY128 </c> instructions.
///
/// \code{.operation}
/// InputKey[127:0] := __key[127:0]
/// KeyMetadata[2:0] := __htype[2:0]
/// KeyMetadata[23:3] := 0 // Reserved for future usage
/// KeyMetadata[27:24] := 0 // KeyType is AES-128 (value of 0)
/// KeyMetadata[127:28] := 0 // Reserved for future usage
/// Handle[383:0] := WrapKey128(InputKey[127:0], KeyMetadata[127:0],
/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
/// dst[0] := IWKey.NoBackup
/// dst[4:1] := IWKey.KeySource[3:0]
/// dst[31:5] := 0
/// MEM[__h+127:__h] := Handle[127:0] // AAD
/// MEM[__h+255:__h+128] := Handle[255:128] // Integrity Tag
/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText
/// OF := 0
/// SF := 0
/// ZF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
return __builtin_ia32_encodekey128_u32(__htype, (__v2di)__key, __h);
}
/// Wrap a 256-bit AES key from __key_hi:__key_lo into a key handle, store the
/// handle in ((__m128i*)__h) to ((__m128i*)__h) + 3, and return a 32-bit
/// value.
/// The explicit source operand __htype specifies handle restrictions.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ENCODEKEY256 </c> instructions.
///
/// \code{.operation}
/// InputKey[127:0] := __key_lo[127:0]
/// InputKey[255:128] := __key_hi[255:128]
/// KeyMetadata[2:0] := __htype[2:0]
/// KeyMetadata[23:3] := 0 // Reserved for future usage
/// KeyMetadata[27:24] := 1 // KeyType is AES-256 (value of 1)
/// KeyMetadata[127:28] := 0 // Reserved for future usage
/// Handle[511:0] := WrapKey256(InputKey[255:0], KeyMetadata[127:0],
/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
/// dst[0] := IWKey.NoBackup
/// dst[4:1] := IWKey.KeySource[3:0]
/// dst[31:5] := 0
/// MEM[__h+127:__h] := Handle[127:0] // AAD
/// MEM[__h+255:__h+128] := Handle[255:128] // Tag
/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText[127:0]
/// MEM[__h+511:__h+384] := Handle[511:384] // CipherText[255:128]
/// OF := 0
/// SF := 0
/// ZF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
void *__h) {
return __builtin_ia32_encodekey256_u32(__htype, (__v2di)__key_lo,
(__v2di)__key_hi, __h);
}
/// The AESENC128KL performs 10 rounds of AES to encrypt the __idata using
/// the 128-bit key in the handle from the __h. It stores the result in
/// __odata and returns the affected ZF flag status.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> AESENC128KL </c> instructions.
///
/// \code{.operation}
/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
/// Handle[383:256] ||
/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
/// IF (IllegalHandle)
/// ZF := 1
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
/// IF (Authentic == 0)
/// ZF := 1
/// ELSE
/// MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], UnwrappedKey)
/// ZF := 0
/// FI
/// FI
/// dst := ZF
/// OF := 0
/// SF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesenc128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
}
/// The AESENC256KL performs 14 rounds of AES to encrypt the __idata using
/// the 256-bit key in the handle from the __h. It stores the result in
/// __odata and returns the affected ZF flag status.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> AESENC256KL </c> instructions.
///
/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h] // Load is not guaranteed to be atomic.
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
/// Handle[255:128] ||
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
/// IF (IllegalHandle)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
/// IF (Authentic == 0)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey)
/// ZF := 0
/// FI
/// FI
/// dst := ZF
/// OF := 0
/// SF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesenc256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
}
/// The AESDEC128KL instruction performs 10 rounds of AES decryption on
/// __idata using the 128-bit key in the handle at __h, stores the result in
/// __odata, and returns the affected ZF flag status.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> AESDEC128KL </c> instructions.
///
/// \code{.operation}
/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
/// IllegalHandle := (HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
/// Handle[383:256] ||
/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128)
/// IF (IllegalHandle)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
/// IF (Authentic == 0)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey)
/// ZF := 0
/// FI
/// FI
/// dst := ZF
/// OF := 0
/// SF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesdec128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
}
/// The AESDEC256KL instruction performs 14 rounds of AES decryption on
/// __idata using the 256-bit key in the handle at __h, stores the result in
/// __odata, and returns the affected ZF flag status.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> AESDEC256KL </c> instructions.
///
/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h]
/// IllegalHandle := (HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
/// Handle[383:256] ||
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256)
/// IF (IllegalHandle)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
/// IF (Authentic == 0)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey)
/// ZF := 0
/// FI
/// FI
/// dst := ZF
/// OF := 0
/// SF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesdec256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
}
#undef __DEFAULT_FN_ATTRS
#endif /* !defined(__SCE__) || __has_feature(modules) || defined(__KL__) */
#if !defined(__SCE__) || __has_feature(modules) || defined(__WIDEKL__)
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("kl,widekl"),\
__min_vector_width__(128)))
/// Encrypt __idata[0] through __idata[7] using the 128-bit AES key indicated
/// by the handle at __h, store the resulting blocks in __odata[0] through
/// __odata[7], and return the affected ZF flag status.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> AESENCWIDE128KL </c> instructions.
///
/// \code{.operation}
/// Handle := MEM[__h+383:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
/// Handle[255:128] ||
/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
/// IF (IllegalHandle)
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
/// IF Authentic == 0
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// FOR i := 0 to 7
/// __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey)
/// ENDFOR
/// ZF := 0
/// FI
/// FI
/// dst := ZF
/// OF := 0
/// SF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesencwide128kl_u8((__v2di *)__odata,
(const __v2di *)__idata, __h);
}
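/* Illustrative usage sketch (not part of the vendored header): encrypt eight
 * blocks in one call with the wide Key Locker form. Assumes `handle` already
 * holds a 384-bit AES-128 key handle produced by _mm_encodekey128_u32(). */
static inline int example_kl_encrypt_8_blocks(const unsigned char handle[48],
                                              const __m128i in[8],
                                              __m128i out[8]) {
  /* Returns ZF: 0 if all eight blocks were encrypted, 1 on an invalid handle. */
  return _mm_aesencwide128kl_u8(out, in, handle);
}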
/// Encrypt __idata[0] through __idata[7] using the 256-bit AES key indicated
/// by the handle at __h, store the resulting blocks in __odata[0] through
/// __odata[7], and return the affected ZF flag status.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> AESENCWIDE256KL </c> instructions.
///
/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
/// Handle[255:128] ||
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
/// IF (IllegalHandle)
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
/// IF Authentic == 0
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// FOR i := 0 to 7
/// __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey)
/// ENDFOR
/// ZF := 0
/// FI
/// FI
/// dst := ZF
/// OF := 0
/// SF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesencwide256kl_u8((__v2di *)__odata,
(const __v2di *)__idata, __h);
}
/// Decrypt __idata[0] through __idata[7] using the 128-bit AES key indicated
/// by the handle at __h, store the resulting blocks in __odata[0] through
/// __odata[7], and return the affected ZF flag status.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> AESDECWIDE128KL </c> instructions.
///
/// \code{.operation}
/// Handle[383:0] := MEM[__h+383:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
/// Handle[255:128] ||
/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
/// IF (IllegalHandle)
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
/// IF Authentic == 0
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// FOR i := 0 to 7
/// __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey)
/// ENDFOR
/// ZF := 0
/// FI
/// FI
/// dst := ZF
/// OF := 0
/// SF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesdecwide128kl_u8((__v2di *)__odata,
(const __v2di *)__idata, __h);
}
/// Decrypt __idata[0] through __idata[7] using the 256-bit AES key indicated
/// by the handle at __h, store the resulting blocks in __odata[0] through
/// __odata[7], and return the affected ZF flag status.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> AESDECWIDE256KL </c> instructions.
///
/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
/// Handle[255:128] ||
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
/// IF (IllegalHandle)
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
/// IF Authentic == 0
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// FOR i := 0 to 7
/// __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey)
/// ENDFOR
/// ZF := 0
/// FI
/// FI
/// dst := ZF
/// OF := 0
/// SF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesdecwide256kl_u8((__v2di *)__odata,
(const __v2di *)__idata, __h);
}
#undef __DEFAULT_FN_ATTRS
#endif /* !defined(__SCE__) || __has_feature(modules) || defined(__WIDEKL__) */
#endif /* _KEYLOCKERINTRIN_H */

136
third_party/intel/clang/lwpintrin.h vendored Normal file
View file

@ -0,0 +1,136 @@
/*===---- lwpintrin.h - LWP intrinsics -------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86INTRIN_H
#error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __LWPINTRIN_H
#define __LWPINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp")))
/// Parses the LWPCB at the specified address and enables
/// profiling if valid.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LLWPCB </c> instruction.
///
/// \param __addr
/// Address to the new Lightweight Profiling Control Block (LWPCB). If the
/// LWPCB is valid, writes the address into the LWP_CBADDR MSR and enables
/// Lightweight Profiling.
static __inline__ void __DEFAULT_FN_ATTRS
__llwpcb (void *__addr)
{
__builtin_ia32_llwpcb(__addr);
}
/// Flushes the LWP state to memory and returns the address of the LWPCB.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SLWPCB </c> instruction.
///
/// \return
/// Address to the current Lightweight Profiling Control Block (LWPCB).
/// If LWP is not currently enabled, returns NULL.
static __inline__ void* __DEFAULT_FN_ATTRS
__slwpcb (void)
{
return __builtin_ia32_slwpcb();
}
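/* Illustrative usage sketch (not part of the vendored header): check whether
 * Lightweight Profiling is currently active by asking for the LWPCB address.
 * Setting up an LWPCB for __llwpcb() is omitted here; its layout is defined
 * by the AMD LWP specification, not by this header. */
static inline int example_lwp_is_active(void) {
  return __slwpcb() != 0;   /* NULL means LWP is not enabled */
}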
/// Inserts programmed event record into the LWP event ring buffer
/// and advances the ring buffer pointer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
///
/// \param DATA2
/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
/// \param DATA1
/// A 32-bit value is inserted into the 32-bit Data1 field.
/// \param FLAGS
/// A 32-bit immediate value is inserted into the 32-bit Flags field.
/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
/// the event record overwrites the last record in the buffer, the MissedEvents
/// counter in the LWPCB is incremented, the head pointer is not advanced, and
/// 1 is returned. Otherwise 0 is returned.
#define __lwpins32(DATA2, DATA1, FLAGS) \
(__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \
(unsigned int) (FLAGS)))
/// Decrements the LWP programmed value sample event counter. If the result is
/// negative, inserts an event record into the LWP event ring buffer in memory
/// and advances the ring buffer pointer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
///
/// \param DATA2
/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
/// \param DATA1
/// A 32-bit value is inserted into the 32-bit Data1 field.
/// \param FLAGS
/// A 32-bit immediate value is inserted into the 32-bit Flags field.
#define __lwpval32(DATA2, DATA1, FLAGS) \
(__builtin_ia32_lwpval32((unsigned int) (DATA2), (unsigned int) (DATA1), \
(unsigned int) (FLAGS)))
#ifdef __x86_64__
/// Inserts programmed event record into the LWP event ring buffer
/// and advances the ring buffer pointer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
///
/// \param DATA2
/// A 64-bit value is inserted into the 64-bit Data2 field.
/// \param DATA1
/// A 32-bit value is inserted into the 32-bit Data1 field.
/// \param FLAGS
/// A 32-bit immediate value is inserted into the 32-bit Flags field.
/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
/// the event record overwrites the last record in the buffer, the MissedEvents
/// counter in the LWPCB is incremented, the head pointer is not advanced, and
/// 1 is returned. Otherwise 0 is returned.
#define __lwpins64(DATA2, DATA1, FLAGS) \
(__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
(unsigned int) (FLAGS)))
/// Decrements the LWP programmed value sample event counter. If the result is
/// negative, inserts an event record into the LWP event ring buffer in memory
/// and advances the ring buffer pointer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
///
/// \param DATA2
/// A 64-bit value is inserted into the 64-bit Data2 field.
/// \param DATA1
/// A 32-bit value is inserted into the 32-bit Data1 field.
/// \param FLAGS
/// A 32-bit immediate value is inserted into the 32-bit Flags field.
#define __lwpval64(DATA2, DATA1, FLAGS) \
(__builtin_ia32_lwpval64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
(unsigned int) (FLAGS)))
#endif
#undef __DEFAULT_FN_ATTRS
#endif /* __LWPINTRIN_H */

104
third_party/intel/clang/lzcntintrin.h vendored Normal file
View file

@ -0,0 +1,104 @@
/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __LZCNTINTRIN_H
#define __LZCNTINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
#ifndef _MSC_VER
/// Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LZCNT instruction.
///
/// \param __X
/// An unsigned 16-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 16-bit integer containing the number of leading zero
/// bits in the operand.
#define __lzcnt16(X) __builtin_ia32_lzcnt_u16((unsigned short)(X))
#endif // _MSC_VER
/// Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LZCNT instruction.
///
/// \param __X
/// An unsigned 32-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of leading zero
/// bits in the operand.
/// \see _lzcnt_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__lzcnt32(unsigned int __X)
{
return __builtin_ia32_lzcnt_u32(__X);
}
/// Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LZCNT instruction.
///
/// \param __X
/// An unsigned 32-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of leading zero
/// bits in the operand.
/// \see __lzcnt32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_lzcnt_u32(unsigned int __X)
{
return __builtin_ia32_lzcnt_u32(__X);
}
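/* Illustrative usage sketch (not part of the vendored header): use LZCNT to
 * compute floor(log2(x)) for a nonzero 32-bit value. Note that _lzcnt_u32(0)
 * returns 32, so callers must handle zero separately. */
static inline unsigned int example_floor_log2_u32(unsigned int x) {
  return 31 - _lzcnt_u32(x);   /* precondition: x != 0 */
}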
#ifdef __x86_64__
#ifndef _MSC_VER
/// Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LZCNT instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of leading zero
/// bits in the operand.
/// \see _lzcnt_u64
#define __lzcnt64(X) __builtin_ia32_lzcnt_u64((unsigned long long)(X))
#endif // _MSC_VER
/// Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LZCNT instruction.
///
/// \param __X
/// An unsigned 64-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of leading zero
/// bits in the operand.
/// \see __lzcnt64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_lzcnt_u64(unsigned long long __X)
{
return __builtin_ia32_lzcnt_u64(__X);
}
#endif
#undef __DEFAULT_FN_ATTRS
#endif /* __LZCNTINTRIN_H */

67
third_party/intel/clang/mm_malloc.h vendored Normal file
View file

@ -0,0 +1,67 @@
/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __MM_MALLOC_H
#define __MM_MALLOC_H
#include <stdlib.h>
#ifdef _WIN32
#include <malloc.h>
#else
#ifndef __cplusplus
extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
#else
// Some systems (e.g. those with GNU libc) declare posix_memalign with an
// exception specifier. Via an "egregious workaround" in
// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid
// redeclaration of glibc's declaration.
extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
#endif
#endif
#if !(defined(_WIN32) && defined(_mm_malloc))
static __inline__ void *__attribute__((__always_inline__, __nodebug__,
__malloc__, __alloc_size__(1),
__alloc_align__(2)))
_mm_malloc(size_t __size, size_t __align) {
if (__align == 1) {
return malloc(__size);
}
if (!(__align & (__align - 1)) && __align < sizeof(void *))
__align = sizeof(void *);
void *__mallocedMemory;
#if defined(__MINGW32__)
__mallocedMemory = __mingw_aligned_malloc(__size, __align);
#elif defined(_WIN32)
__mallocedMemory = _aligned_malloc(__size, __align);
#else
if (posix_memalign(&__mallocedMemory, __align, __size))
return 0;
#endif
return __mallocedMemory;
}
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_free(void *__p)
{
#if defined(__MINGW32__)
__mingw_aligned_free(__p);
#elif defined(_WIN32)
_aligned_free(__p);
#else
free(__p);
#endif
}
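/* Illustrative usage sketch (not part of the vendored header): allocate a
 * 32-byte-aligned float buffer suitable for 256-bit loads/stores and release
 * it with _mm_free(). The element count and function name are arbitrary. */
static inline float *example_alloc_avx_buffer(size_t n) {
  return (float *)_mm_malloc(n * sizeof(float), 32); /* may return 0 on failure */
}
/* ... use the buffer ..., then release it with _mm_free(buffer); */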
#endif
#endif /* __MM_MALLOC_H */

1556
third_party/intel/clang/mmintrin.h vendored Normal file

File diff suppressed because it is too large Load diff

49
third_party/intel/clang/movdirintrin.h vendored Normal file
View file

@ -0,0 +1,49 @@
/*===------------------------- movdirintrin.h ------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <movdirintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _MOVDIRINTRIN_H
#define _MOVDIRINTRIN_H
/* Move doubleword as direct store */
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("movdiri")))
_directstoreu_u32 (void *__dst, unsigned int __value)
{
__builtin_ia32_directstore_u32((unsigned int *)__dst, (unsigned int)__value);
}
#ifdef __x86_64__
/* Move quadword as direct store */
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("movdiri")))
_directstoreu_u64 (void *__dst, unsigned long __value)
{
__builtin_ia32_directstore_u64((unsigned long *)__dst, __value);
}
#endif /* __x86_64__ */
/*
* movdir64b - Move 64 bytes as direct store.
* The destination must be 64 byte aligned, and the store is atomic.
* The source address has no alignment requirement, and the load from
* the source address is not atomic.
*/
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("movdir64b")))
_movdir64b (void *__dst __attribute__((align_value(64))), const void *__src)
{
__builtin_ia32_movdir64b(__dst, __src);
}
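/* Illustrative usage sketch (not part of the vendored header): copy a 64-byte
 * work descriptor to a device portal with a single 64-byte direct store. The
 * struct and variable names are invented for the example; only the
 * destination must be 64-byte aligned. */
struct example_descriptor { unsigned char bytes[64]; };
static inline void example_submit(void *portal /* 64-byte aligned MMIO */,
                                  const struct example_descriptor *desc) {
  _movdir64b(portal, desc);   /* 64-byte store is atomic; source load is not */
}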
#endif /* _MOVDIRINTRIN_H */

62
third_party/intel/clang/mwaitxintrin.h vendored Normal file
View file

@ -0,0 +1,62 @@
/*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86INTRIN_H
#error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __MWAITXINTRIN_H
#define __MWAITXINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mwaitx")))
/// Establishes a linear address memory range to be monitored and puts
/// the processor in the monitor event pending state. Data stored in the
/// monitored address range causes the processor to exit the pending state.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c MONITORX instruction.
///
/// \param __p
/// The memory range to be monitored. The size of the range is determined by
/// CPUID function 0000_0005h.
/// \param __extensions
/// Optional extensions for the monitoring state.
/// \param __hints
/// Optional hints for the monitoring state.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_monitorx(void * __p, unsigned __extensions, unsigned __hints)
{
__builtin_ia32_monitorx(__p, __extensions, __hints);
}
/// Used with the \c MONITORX instruction to wait while the processor is in
/// the monitor event pending state. Data stored in the monitored address
/// range, or an interrupt, causes the processor to exit the pending state.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c MWAITX instruction.
///
/// \param __extensions
/// Optional extensions for the monitoring state, which can vary by
/// processor.
/// \param __hints
/// Optional hints for the monitoring state, which can vary by processor.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
{
__builtin_ia32_mwaitx(__extensions, __hints, __clock);
}
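/* Illustrative usage sketch (not part of the vendored header): wait for a
 * flag to change using MONITORX/MWAITX instead of spinning. Extensions,
 * hints, and the timer value are all passed as 0 here for simplicity;
 * re-check the flag after every wakeup. */
static inline void example_wait_for_flag(volatile int *flag) {
  while (*flag == 0) {
    _mm_monitorx((void *)flag, 0, 0);  /* arm the monitor on the flag's line */
    if (*flag == 0)                    /* avoid sleeping on a stale value */
      _mm_mwaitx(0, 0, 0);             /* extensions, hints, clock */
  }
}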
#undef __DEFAULT_FN_ATTRS
#endif /* __MWAITXINTRIN_H */

20
third_party/intel/clang/nmmintrin.h vendored Normal file
View file

@ -0,0 +1,20 @@
/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __NMMINTRIN_H
#define __NMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
/* To match the expectations of gcc, we put the sse4.2 definitions into
   smmintrin.h, so just include that header here. */
#include "smmintrin.h"
#endif /* __NMMINTRIN_H */

40
third_party/intel/clang/pconfigintrin.h vendored Normal file
View file

@ -0,0 +1,40 @@
/*===---- pconfigintrin.h - X86 platform configuration ---------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <pconfigintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __PCONFIGINTRIN_H
#define __PCONFIGINTRIN_H
#define __PCONFIG_KEY_PROGRAM 0x00000001
#if __has_extension(gnu_asm)
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("pconfig")))
static __inline unsigned int __DEFAULT_FN_ATTRS
_pconfig_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
{
unsigned int __result;
__asm__ ("pconfig"
: "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
: "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
: "cc");
return __result;
}
#undef __DEFAULT_FN_ATTRS
#endif /* __has_extension(gnu_asm) */
#endif

34
third_party/intel/clang/pkuintrin.h vendored Normal file
View file

@ -0,0 +1,34 @@
/*===---- pkuintrin.h - PKU intrinsics -------------------------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <pkuintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __PKUINTRIN_H
#define __PKUINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("pku")))
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_rdpkru_u32(void)
{
return __builtin_ia32_rdpkru();
}
static __inline__ void __DEFAULT_FN_ATTRS
_wrpkru(unsigned int __val)
{
__builtin_ia32_wrpkru(__val);
}
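/* Illustrative usage sketch (not part of the vendored header): read PKRU,
 * set the access-disable bit for protection key 1, and write it back. The
 * bit layout (two bits per key: AD then WD) follows the Intel SDM; key
 * number 1 is arbitrary for the example. */
static inline void example_deny_access_to_pkey1(void) {
  unsigned int pkru = _rdpkru_u32();
  pkru |= (1u << (2 * 1));   /* AD bit for protection key 1 */
  _wrpkru(pkru);
}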
#undef __DEFAULT_FN_ATTRS
#endif

301
third_party/intel/clang/pmmintrin.h vendored Normal file
View file

@ -0,0 +1,301 @@
/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __PMMINTRIN_H
#define __PMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include "emmintrin.h"
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("sse3,no-evex512"), __min_vector_width__(128)))
/// Loads data from an unaligned memory location to elements in a 128-bit
/// vector.
///
/// If the address of the data is not 16-byte aligned, the instruction may
/// read two adjacent aligned blocks of memory to retrieve the requested
/// data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit integer vector containing integer values.
/// \returns A 128-bit vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_lddqu_si128(__m128i_u const *__p)
{
return (__m128i)__builtin_ia32_lddqu((char const *)__p);
}
/// Adds the even-indexed values and subtracts the odd-indexed values of
/// two 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing the left source operand.
/// \param __b
/// A 128-bit vector of [4 x float] containing the right source operand.
/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
/// differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_addsub_ps(__m128 __a, __m128 __b)
{
return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
}
/// Horizontally adds the adjacent pairs of values contained in two
/// 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// The horizontal sums of the values are stored in the lower bits of the
/// destination.
/// \param __b
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// The horizontal sums of the values are stored in the upper bits of the
/// destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
/// both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hadd_ps(__m128 __a, __m128 __b)
{
return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
}
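/* Illustrative usage sketch (not part of the vendored header): reduce a
 * 128-bit vector of [4 x float] to the sum of its elements by applying the
 * horizontal add twice. */
static inline float example_sum4(__m128 v) {
  __m128 t = _mm_hadd_ps(v, v);   /* {a+b, c+d, a+b, c+d} */
  t = _mm_hadd_ps(t, t);          /* {a+b+c+d, ...} */
  return _mm_cvtss_f32(t);        /* extract the low element */
}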
/// Horizontally subtracts the adjacent pairs of values contained in two
/// 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// The horizontal differences between the values are stored in the lower
/// bits of the destination.
/// \param __b
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// The horizontal differences between the values are stored in the upper
/// bits of the destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal
/// differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hsub_ps(__m128 __a, __m128 __b)
{
return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
}
/// Moves and duplicates odd-indexed values from a 128-bit vector
/// of [4 x float] to float values stored in a 128-bit vector of
/// [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float]. \n
/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of
/// the destination. \n
/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
/// destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
/// values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehdup_ps(__m128 __a)
{
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
}
/// Duplicates even-indexed values from a 128-bit vector of
/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float] \n
/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of
/// the destination. \n
/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
/// destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
/// values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_moveldup_ps(__m128 __a)
{
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
}
/// Adds the even-indexed values and subtracts the odd-indexed values of
/// two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing the left source operand.
/// \param __b
/// A 128-bit vector of [2 x double] containing the right source operand.
/// \returns A 128-bit vector of [2 x double] containing the alternating sums
/// and differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_addsub_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
}
/// Horizontally adds the pairs of values contained in two 128-bit
/// vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// The horizontal sum of the values is stored in the lower bits of the
/// destination.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// The horizontal sum of the values is stored in the upper bits of the
/// destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
/// both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hadd_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
}
/// Horizontally subtracts the pairs of values contained in two 128-bit
/// vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// The horizontal difference of the values is stored in the lower bits of
/// the destination.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// The horizontal difference of the values is stored in the upper bits of
/// the destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal
/// differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hsub_pd(__m128d __a, __m128d __b)
{
return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
}
/// Moves and duplicates one double-precision value to double-precision
/// values stored in a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_loaddup_pd(double const *dp);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param dp
/// A pointer to a double-precision value to be moved and duplicated.
/// \returns A 128-bit vector of [2 x double] containing the moved and
/// duplicated values.
#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)
/// Moves and duplicates the double-precision value in the lower bits of
/// a 128-bit vector of [2 x double] to double-precision values stored in a
/// 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
/// [127:64] and [63:0] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the moved and
/// duplicated values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_movedup_pd(__m128d __a)
{
return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
}
/// Establishes a linear address memory range to be monitored and puts
/// the processor in the monitor event pending state. Data stored in the
/// monitored address range causes the processor to exit the pending state.
///
/// The \c MONITOR instruction can be used in kernel mode, and in other modes
/// if MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c MONITOR instruction.
///
/// \param __p
/// The memory range to be monitored. The size of the range is determined by
/// CPUID function 0000_0005h.
/// \param __extensions
/// Optional extensions for the monitoring state.
/// \param __hints
/// Optional hints for the monitoring state.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
{
__builtin_ia32_monitor(__p, __extensions, __hints);
}
/// Used with the \c MONITOR instruction to wait while the processor is in
/// the monitor event pending state. Data stored in the monitored address
/// range, or an interrupt, causes the processor to exit the pending state.
///
/// The \c MWAIT instruction can be used in kernel mode, and in other modes if
/// MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c MWAIT instruction.
///
/// \param __extensions
/// Optional extensions for the monitoring state, which can vary by
/// processor.
/// \param __hints
/// Optional hints for the monitoring state, which can vary by processor.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mwait(unsigned __extensions, unsigned __hints)
{
__builtin_ia32_mwait(__extensions, __hints);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __PMMINTRIN_H */

59
third_party/intel/clang/popcntintrin.h vendored Normal file
View file

@ -0,0 +1,59 @@
/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __POPCNTINTRIN_H
#define __POPCNTINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#endif
/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// An unsigned 32-bit integer operand.
/// \returns A 32-bit integer containing the number of bits with value 1 in the
/// source operand.
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_popcnt_u32(unsigned int __A)
{
return __builtin_popcount(__A);
}
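/* Illustrative usage sketch (not part of the vendored header): compute the
 * Hamming distance between two 32-bit words with a single POPCNT. */
static inline int example_hamming32(unsigned int a, unsigned int b) {
  return _mm_popcnt_u32(a ^ b);   /* count of differing bit positions */
}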
#ifdef __x86_64__
/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
/// An unsigned 64-bit integer operand.
/// \returns A 64-bit integer containing the number of bits with value 1 in the
/// source operand.
static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_popcnt_u64(unsigned long long __A)
{
return __builtin_popcountll(__A);
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#endif /* __POPCNTINTRIN_H */

61
third_party/intel/clang/prfchiintrin.h vendored Normal file
View file

@ -0,0 +1,61 @@
/*===---- prfchiintrin.h - PREFETCHI intrinsic -----------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __PRFCHIINTRIN_H
#define __PRFCHIINTRIN_H
#ifdef __x86_64__
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("prefetchi")))
/// Loads an instruction sequence containing the specified memory address into
/// all cache levels.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHIT0 instruction.
///
/// \param __P
/// A pointer specifying the memory address to be prefetched.
static __inline__ void __DEFAULT_FN_ATTRS
_m_prefetchit0(volatile const void *__P) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
__builtin_ia32_prefetchi((const void *)__P, 3 /* _MM_HINT_T0 */);
#pragma clang diagnostic pop
}
/// Loads an instruction sequence containing the specified memory address into
/// all but the first-level cache.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHIT1 instruction.
///
/// \param __P
/// A pointer specifying the memory address to be prefetched.
static __inline__ void __DEFAULT_FN_ATTRS
_m_prefetchit1(volatile const void *__P) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
__builtin_ia32_prefetchi((const void *)__P, 2 /* _MM_HINT_T1 */);
#pragma clang diagnostic pop
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#endif /* __PRFCHIINTRIN_H */

60
third_party/intel/clang/prfchwintrin.h vendored Normal file
View file

@ -0,0 +1,60 @@
/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __PRFCHWINTRIN_H
#define __PRFCHWINTRIN_H
/// Loads a memory sequence containing the specified memory address into
/// all data cache levels.
///
/// The cache-coherency state is set to exclusive. Data can be read from
/// and written to the cache line without additional delay.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHT0 instruction.
///
/// \param __P
/// A pointer specifying the memory address to be prefetched.
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_m_prefetch(void *__P)
{
__builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
}
/// Loads a memory sequence containing the specified memory address into
/// the L1 data cache and sets the cache-coherency state to modified.
///
/// This provides a hint to the processor that the cache line will be
/// modified. It is intended for use when the cache line will be written to
/// shortly after the prefetch is performed.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHW instruction.
///
/// \param __P
/// A pointer specifying the memory address to be prefetched.
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_m_prefetchw(volatile const void *__P)
{
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
__builtin_prefetch ((const void*)__P, 1, 3 /* _MM_HINT_T0 */);
#pragma clang diagnostic pop
}
#endif /* __PRFCHWINTRIN_H */

37
third_party/intel/clang/ptwriteintrin.h vendored Normal file
View file

@ -0,0 +1,37 @@
/*===------------ ptwriteintrin.h - PTWRITE intrinsic --------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <ptwriteintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __PTWRITEINTRIN_H
#define __PTWRITEINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("ptwrite")))
static __inline__ void __DEFAULT_FN_ATTRS
_ptwrite32(unsigned int __value) {
__builtin_ia32_ptwrite32(__value);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS
_ptwrite64(unsigned long long __value) {
__builtin_ia32_ptwrite64(__value);
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#endif /* __PTWRITEINTRIN_H */

203
third_party/intel/clang/raointintrin.h vendored Normal file
View file

@ -0,0 +1,203 @@
/*===----------------------- raointintrin.h - RAOINT ------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86GPRINTRIN_H
#error "Never use <raointintrin.h> directly; include <x86gprintrin.h> instead."
#endif // __X86GPRINTRIN_H
#ifndef __RAOINTINTRIN_H
#define __RAOINTINTRIN_H
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("raoint")))
/// Atomically add a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in poor performance for hot data used by only a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AADD instruction.
///
/// \param __A
/// A pointer to a 32-bit memory location.
/// \param __B
/// A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] + __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aadd_i32(int *__A, int __B) {
__builtin_ia32_aadd32((int *)__A, __B);
}
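/* Illustrative usage sketch (not part of the vendored header): bump a shared
 * statistics counter that many threads update but rarely read. The counter
 * name is invented; remotely-atomic adds trade single-thread latency for
 * better behavior under contention. */
static inline void example_count_event(int *shared_counter) {
  _aadd_i32(shared_counter, 1);   /* atomic add, no result returned */
}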
/// Atomically and a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in poor performance for hot data used by only a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AAND instruction.
///
/// \param __A
/// A pointer to a 32-bit memory location.
/// \param __B
/// A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] AND __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aand_i32(int *__A, int __B) {
__builtin_ia32_aand32((int *)__A, __B);
}
/// Atomically or a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in poor performance for hot data used by only a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AOR instruction.
///
/// \param __A
/// A pointer to a 32-bit memory location.
/// \param __B
/// A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] OR __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aor_i32(int *__A, int __B) {
__builtin_ia32_aor32((int *)__A, __B);
}
/// Atomically xor a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in poor performance for hot data used by only a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AXOR instruction.
///
/// \param __A
/// A pointer to a 32-bit memory location.
/// \param __B
/// A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] XOR __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _axor_i32(int *__A, int __B) {
__builtin_ia32_axor32((int *)__A, __B);
}
#ifdef __x86_64__
/// Atomically add a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in poor performance for hot data used by only a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AADD instruction.
///
/// \param __A
/// A pointer to a 64-bit memory location.
/// \param __B
/// A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] + __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aadd_i64(long long *__A,
long long __B) {
__builtin_ia32_aadd64((long long *)__A, __B);
}
/// Atomically and a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in poor performance for hot data used by only a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AAND instruction.
///
/// \param __A
/// A pointer to a 64-bit memory location.
/// \param __B
/// A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] AND __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aand_i64(long long *__A,
long long __B) {
__builtin_ia32_aand64((long long *)__A, __B);
}
/// Atomically or a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in poor performance for hot data used by only a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AOR instruction.
///
/// \param __A
/// A pointer to a 64-bit memory location.
/// \param __B
/// A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] OR __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aor_i64(long long *__A,
long long __B) {
__builtin_ia32_aor64((long long *)__A, __B);
}
/// Atomically xor a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic should be used for contention or weak ordering. It may
/// result in poor performance for hot data used by only a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AXOR instruction.
///
/// \param __A
/// A pointer to a 64-bit memory location.
/// \param __B
/// A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] XOR __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _axor_i64(long long *__A,
long long __B) {
__builtin_ia32_axor64((long long *)__A, __B);
}
#endif // __x86_64__
#undef __DEFAULT_FN_ATTRS
#endif // __RAOINTINTRIN_H

57
third_party/intel/clang/rdpruintrin.h vendored Normal file
View file

@ -0,0 +1,57 @@
/*===---- rdpruintrin.h - RDPRU intrinsics ---------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H
#error "Never use <rdpruintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __RDPRUINTRIN_H
#define __RDPRUINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("rdpru")))
/// Reads the content of a processor register.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> RDPRU </c> instruction.
///
/// \param reg_id
/// A processor register identifier.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__rdpru (int reg_id)
{
return __builtin_ia32_rdpru(reg_id);
}
#define __RDPRU_MPERF 0
#define __RDPRU_APERF 1
/// Reads the content of processor register MPERF.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic generates instruction <c> RDPRU </c> to read the value of
/// register MPERF.
#define __mperf() __builtin_ia32_rdpru(__RDPRU_MPERF)
/// Reads the content of processor register APERF.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic generates instruction <c> RDPRU </c> to read the value of
/// register APERF.
#define __aperf() __builtin_ia32_rdpru(__RDPRU_APERF)
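/* Illustrative usage sketch (not part of the vendored header): sample APERF
 * and MPERF around a region of interest; the ratio of the deltas approximates
 * the effective-to-nominal frequency ratio over that region. The function
 * name and the usage comment are invented for the example. */
static inline double example_freq_ratio(unsigned long long a0,
                                        unsigned long long m0) {
  unsigned long long a1 = __aperf();
  unsigned long long m1 = __mperf();
  return (m1 - m0) ? (double)(a1 - a0) / (double)(m1 - m0) : 0.0;
}
/* Usage: a0 = __aperf(); m0 = __mperf(); ...work...; r = example_freq_ratio(a0, m0); */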
#undef __DEFAULT_FN_ATTRS
#endif /* __RDPRUINTRIN_H */

105
third_party/intel/clang/rdseedintrin.h vendored Normal file
View file

@ -0,0 +1,105 @@
/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <rdseedintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __RDSEEDINTRIN_H
#define __RDSEEDINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed")))
/// Stores a hardware-generated 16-bit random value in the memory at \a __p.
///
/// The random number generator complies with NIST SP800-90B and SP800-90C.
///
/// \code{.operation}
/// IF HW_NRND_GEN.ready == 1
/// Store16(__p, HW_NRND_GEN.data)
/// result := 1
/// ELSE
/// Store16(__p, 0)
/// result := 0
/// END
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c RDSEED instruction.
///
/// \param __p
/// Pointer to memory for storing the 16-bit random number.
/// \returns 1 if a random number was generated, 0 if not.
static __inline__ int __DEFAULT_FN_ATTRS
_rdseed16_step(unsigned short *__p)
{
return (int) __builtin_ia32_rdseed16_step(__p);
}
/// Stores a hardware-generated 32-bit random value in the memory at \a __p.
///
/// The random number generator complies with NIST SP800-90B and SP800-90C.
///
/// \code{.operation}
/// IF HW_NRND_GEN.ready == 1
/// Store32(__p, HW_NRND_GEN.data)
/// result := 1
/// ELSE
/// Store32(__p, 0)
/// result := 0
/// END
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c RDSEED instruction.
///
/// \param __p
/// Pointer to memory for storing the 32-bit random number.
/// \returns 1 if a random number was generated, 0 if not.
static __inline__ int __DEFAULT_FN_ATTRS
_rdseed32_step(unsigned int *__p)
{
return (int) __builtin_ia32_rdseed32_step(__p);
}
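/* Illustrative usage sketch (not part of the vendored header): RDSEED can
 * transiently fail when entropy is exhausted, so retry a bounded number of
 * times before falling back. The retry limit is arbitrary for the example. */
static inline int example_get_seed32(unsigned int *out) {
  for (int i = 0; i < 10; ++i)
    if (_rdseed32_step(out))
      return 1;    /* success: *out holds a fresh 32-bit seed */
  return 0;        /* caller should fall back to another entropy source */
}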
#ifdef __x86_64__
/// Stores a hardware-generated 64-bit random value in the memory at \a __p.
///
/// The random number generator complies with NIST SP800-90B and SP800-90C.
///
/// \code{.operation}
/// IF HW_NRND_GEN.ready == 1
/// Store64(__p, HW_NRND_GEN.data)
/// result := 1
/// ELSE
/// Store64(__p, 0)
/// result := 0
/// END
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c RDSEED instruction.
///
/// \param __p
/// Pointer to memory for storing the 64-bit random number.
/// \returns 1 if a random number was generated, 0 if not.
static __inline__ int __DEFAULT_FN_ATTRS
_rdseed64_step(unsigned long long *__p)
{
return (int) __builtin_ia32_rdseed64_step(__p);
}
#endif
#undef __DEFAULT_FN_ATTRS
#endif /* __RDSEEDINTRIN_H */

45
third_party/intel/clang/rtmintrin.h vendored Normal file
View file

@ -0,0 +1,45 @@
/*===---- rtmintrin.h - RTM intrinsics -------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __RTMINTRIN_H
#define __RTMINTRIN_H
#define _XBEGIN_STARTED (~0u)
#define _XABORT_EXPLICIT (1 << 0)
#define _XABORT_RETRY (1 << 1)
#define _XABORT_CONFLICT (1 << 2)
#define _XABORT_CAPACITY (1 << 3)
#define _XABORT_DEBUG (1 << 4)
#define _XABORT_NESTED (1 << 5)
#define _XABORT_CODE(x) (((x) >> 24) & 0xFF)
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_xbegin(void)
{
return (unsigned int)__builtin_ia32_xbegin();
}
static __inline__ void __DEFAULT_FN_ATTRS
_xend(void)
{
__builtin_ia32_xend();
}
#define _xabort(imm) __builtin_ia32_xabort((imm))
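/* Illustrative usage sketch (not part of the vendored header): attempt a
 * small critical section as an RTM transaction and fall back to a
 * caller-supplied lock path when the transaction aborts. The fallback
 * function pointer is assumed to exist for the example. */
static inline void example_txn_increment(int *counter,
                                         void (*locked_fallback)(int *)) {
  if (_xbegin() == _XBEGIN_STARTED) {
    ++*counter;                 /* runs transactionally */
    _xend();                    /* commit */
  } else {
    locked_fallback(counter);   /* abort path: take a real lock instead */
  }
}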
#undef __DEFAULT_FN_ATTRS
#endif /* __RTMINTRIN_H */

30
third_party/intel/clang/serializeintrin.h vendored Normal file
View file

@ -0,0 +1,30 @@
/*===--------------- serializeintrin.h - serialize intrinsics --------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <serializeintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __SERIALIZEINTRIN_H
#define __SERIALIZEINTRIN_H
/// Serialize instruction fetch and execution.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SERIALIZE </c> instruction.
///
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("serialize")))
_serialize (void)
{
__builtin_ia32_serialize ();
}
#endif /* __SERIALIZEINTRIN_H */

third_party/intel/clang/sgxintrin.h vendored Normal file

@ -0,0 +1,60 @@
/*===---- sgxintrin.h - X86 SGX intrinsics configuration -------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <sgxintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __SGXINTRIN_H
#define __SGXINTRIN_H
#if __has_extension(gnu_asm)
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("sgx")))
static __inline unsigned int __DEFAULT_FN_ATTRS
_enclu_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
{
unsigned int __result;
__asm__ ("enclu"
: "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
: "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
: "cc");
return __result;
}
static __inline unsigned int __DEFAULT_FN_ATTRS
_encls_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
{
unsigned int __result;
__asm__ ("encls"
: "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
: "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
: "cc");
return __result;
}
static __inline unsigned int __DEFAULT_FN_ATTRS
_enclv_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
{
unsigned int __result;
__asm__ ("enclv"
: "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
: "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
: "cc");
return __result;
}
#undef __DEFAULT_FN_ATTRS
#endif /* __has_extension(gnu_asm) */
#endif

third_party/intel/clang/sha512intrin.h vendored Normal file

@ -0,0 +1,200 @@
/*===--------------- sha512intrin.h - SHA512 intrinsics -----------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <sha512intrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H
#ifndef __SHA512INTRIN_H
#define __SHA512INTRIN_H
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("sha512"), \
__min_vector_width__(256)))
/// This intrinsic is one of the two SHA512 message scheduling instructions.
/// The intrinsic performs an intermediate calculation for the next four
/// SHA512 message qwords. The calculated results are stored in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_sha512msg1_epi64(__m256i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSHA512MSG1 instruction.
///
/// \param __A
/// A 256-bit vector of [4 x long long].
/// \param __B
/// A 128-bit vector of [2 x long long].
/// \returns
/// A 256-bit vector of [4 x long long].
///
/// \code{.operation}
/// DEFINE ROR64(qword, n) {
/// count := n % 64
/// dest := (qword >> count) | (qword << (64 - count))
/// RETURN dest
/// }
/// DEFINE SHR64(qword, n) {
/// RETURN qword >> n
/// }
/// DEFINE s0(qword) {
/// RETURN ROR64(qword,1) ^ ROR64(qword, 8) ^ SHR64(qword, 7)
/// }
/// W[4] := __B.qword[0]
/// W[3] := __A.qword[3]
/// W[2] := __A.qword[2]
/// W[1] := __A.qword[1]
/// W[0] := __A.qword[0]
/// dst.qword[3] := W[3] + s0(W[4])
/// dst.qword[2] := W[2] + s0(W[3])
/// dst.qword[1] := W[1] + s0(W[2])
/// dst.qword[0] := W[0] + s0(W[1])
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sha512msg1_epi64(__m256i __A, __m128i __B) {
return (__m256i)__builtin_ia32_vsha512msg1((__v4du)__A, (__v2du)__B);
}
/// This intrinsic is one of the two SHA512 message scheduling instructions.
/// The intrinsic performs the final calculation for the next four SHA512
/// message qwords. The calculated results are stored in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_sha512msg2_epi64(__m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSHA512MSG2 instruction.
///
/// \param __A
/// A 256-bit vector of [4 x long long].
/// \param __B
/// A 256-bit vector of [4 x long long].
/// \returns
/// A 256-bit vector of [4 x long long].
///
/// \code{.operation}
/// DEFINE ROR64(qword, n) {
/// count := n % 64
/// dest := (qword >> count) | (qword << (64 - count))
/// RETURN dest
/// }
/// DEFINE SHR64(qword, n) {
/// RETURN qword >> n
/// }
/// DEFINE s1(qword) {
/// RETURN ROR64(qword,19) ^ ROR64(qword, 61) ^ SHR64(qword, 6)
/// }
/// W[14] := __B.qword[2]
/// W[15] := __B.qword[3]
/// W[16] := __A.qword[0] + s1(W[14])
/// W[17] := __A.qword[1] + s1(W[15])
/// W[18] := __A.qword[2] + s1(W[16])
/// W[19] := __A.qword[3] + s1(W[17])
/// dst.qword[3] := W[19]
/// dst.qword[2] := W[18]
/// dst.qword[1] := W[17]
/// dst.qword[0] := W[16]
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sha512msg2_epi64(__m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vsha512msg2((__v4du)__A, (__v4du)__B);
}
/// This intrinsic performs two rounds of SHA512 operation using initial SHA512
/// state (C,D,G,H) from \a __A, an initial SHA512 state (A,B,E,F) from
/// \a __B, and a pre-computed sum of the next two round message qwords and
/// the corresponding round constants from \a __C (only the two lower qwords
/// of the third operand). The updated SHA512 state (A,B,E,F) is written to
/// \a dst, and \a dst can be used as the updated state (C,D,G,H) in later
/// rounds.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_sha512rnds2_epi64(__m256i __A, __m256i __B, __m128i __C)
/// \endcode
///
/// This intrinsic corresponds to the \c VSHA512RNDS2 instruction.
///
/// \param __A
/// A 256-bit vector of [4 x long long].
/// \param __B
/// A 256-bit vector of [4 x long long].
/// \param __C
/// A 128-bit vector of [2 x long long].
/// \returns
/// A 256-bit vector of [4 x long long].
///
/// \code{.operation}
/// DEFINE ROR64(qword, n) {
/// count := n % 64
/// dest := (qword >> count) | (qword << (64 - count))
/// RETURN dest
/// }
/// DEFINE SHR64(qword, n) {
/// RETURN qword >> n
/// }
/// DEFINE cap_sigma0(qword) {
/// RETURN ROR64(qword,28) ^ ROR64(qword, 34) ^ ROR64(qword, 39)
/// }
/// DEFINE cap_sigma1(qword) {
/// RETURN ROR64(qword,14) ^ ROR64(qword, 18) ^ ROR64(qword, 41)
/// }
/// DEFINE MAJ(a,b,c) {
/// RETURN (a & b) ^ (a & c) ^ (b & c)
/// }
/// DEFINE CH(e,f,g) {
/// RETURN (e & f) ^ (g & ~e)
/// }
/// A[0] := __B.qword[3]
/// B[0] := __B.qword[2]
/// C[0] := __A.qword[3]
/// D[0] := __A.qword[2]
/// E[0] := __B.qword[1]
/// F[0] := __B.qword[0]
/// G[0] := __A.qword[1]
/// H[0] := __A.qword[0]
/// WK[0]:= __C.qword[0]
/// WK[1]:= __C.qword[1]
/// FOR i := 0 to 1:
/// A[i+1] := CH(E[i], F[i], G[i]) +
/// cap_sigma1(E[i]) + WK[i] + H[i] +
/// MAJ(A[i], B[i], C[i]) +
/// cap_sigma0(A[i])
/// B[i+1] := A[i]
/// C[i+1] := B[i]
/// D[i+1] := C[i]
/// E[i+1] := CH(E[i], F[i], G[i]) +
/// cap_sigma1(E[i]) + WK[i] + H[i] + D[i]
/// F[i+1] := E[i]
/// G[i+1] := F[i]
/// H[i+1] := G[i]
/// ENDFOR
/// dst.qword[3] := A[2]
/// dst.qword[2] := B[2]
/// dst.qword[1] := E[2]
/// dst.qword[0] := F[2]
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sha512rnds2_epi64(__m256i __A, __m256i __B, __m128i __C) {
return (__m256i)__builtin_ia32_vsha512rnds2((__v4du)__A, (__v4du)__B,
(__v2du)__C);
}
#undef __DEFAULT_FN_ATTRS256
#endif // __SHA512INTRIN_H
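/* Illustrative scalar reference, not part of the vendored header: the s0()
 * and s1() helpers from the pseudocode above, written in plain C, can be
 * used to cross-check a single qword lane of _mm256_sha512msg1_epi64() and
 * _mm256_sha512msg2_epi64(). The names are hypothetical. */
static __inline__ unsigned long long
example_ror64(unsigned long long __x, unsigned __n) /* requires 0 < __n < 64 */
{
  return (__x >> __n) | (__x << (64 - __n));
}
static __inline__ unsigned long long
example_sha512_s0(unsigned long long __w) /* lowercase sigma0 */
{
  return example_ror64(__w, 1) ^ example_ror64(__w, 8) ^ (__w >> 7);
}
static __inline__ unsigned long long
example_sha512_s1(unsigned long long __w) /* lowercase sigma1 */
{
  return example_ror64(__w, 19) ^ example_ror64(__w, 61) ^ (__w >> 6);
}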

third_party/intel/clang/shaintrin.h vendored Normal file

@ -0,0 +1,189 @@
/*===---- shaintrin.h - SHA intrinsics -------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __SHAINTRIN_H
#define __SHAINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha"), __min_vector_width__(128)))
/// Performs four iterations of the inner loop of the SHA-1 message digest
/// algorithm using the starting SHA-1 state (A, B, C, D) from the 128-bit
/// vector of [4 x i32] in \a V1 and the next four 32-bit elements of the
/// message from the 128-bit vector of [4 x i32] in \a V2. Note that the
/// SHA-1 state variable E must have already been added to \a V2
/// (\c _mm_sha1nexte_epu32() can perform this step). Returns the updated
/// SHA-1 state (A, B, C, D) as a 128-bit vector of [4 x i32].
///
/// The SHA-1 algorithm has an inner loop of 80 iterations, twenty each
/// with a different combining function and rounding constant. This
/// intrinsic performs four iterations using a combining function and
/// rounding constant selected by \a M[1:0].
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_sha1rnds4_epu32(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c SHA1RNDS4 instruction.
///
/// \param V1
/// A 128-bit vector of [4 x i32] containing the initial SHA-1 state.
/// \param V2
/// A 128-bit vector of [4 x i32] containing the next four elements of
/// the message, plus SHA-1 state variable E.
/// \param M
/// An immediate value where bits [1:0] select among four possible
/// combining functions and rounding constants (not specified here).
/// \returns A 128-bit vector of [4 x i32] containing the updated SHA-1 state.
#define _mm_sha1rnds4_epu32(V1, V2, M) \
__builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M))
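/* Illustrative scalar reference, not part of the vendored header: per the
 * SHA-1 standard (FIPS 180-4), the combining function and rounding constant
 * selected by bits [1:0] of M are commonly summarized as below. The helper
 * name is hypothetical and is only meant for cross-checking. */
static __inline__ unsigned int
example_sha1_f(int __m, unsigned int __b, unsigned int __c, unsigned int __d)
{
  switch (__m & 3) {
  case 0: /* rounds  0-19: Ch,     K = 0x5A827999 */
    return (__b & __c) | (~__b & __d);
  case 1: /* rounds 20-39: Parity, K = 0x6ED9EBA1 */
    return __b ^ __c ^ __d;
  case 2: /* rounds 40-59: Maj,    K = 0x8F1BBCDC */
    return (__b & __c) | (__b & __d) | (__c & __d);
  default: /* rounds 60-79: Parity, K = 0xCA62C1D6 */
    return __b ^ __c ^ __d;
  }
}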
/// Calculates the SHA-1 state variable E from the SHA-1 state variables in
/// the 128-bit vector of [4 x i32] in \a __X, adds that to the next set of
/// four message elements in the 128-bit vector of [4 x i32] in \a __Y, and
/// returns the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c SHA1NEXTE instruction.
///
/// \param __X
/// A 128-bit vector of [4 x i32] containing the current SHA-1 state.
/// \param __Y
/// A 128-bit vector of [4 x i32] containing the next four elements of the
/// message.
/// \returns A 128-bit vector of [4 x i32] containing the updated SHA-1
/// values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_sha1nexte((__v4si)__X, (__v4si)__Y);
}
/// Performs an intermediate calculation for deriving the next four SHA-1
/// message elements using previous message elements from the 128-bit
/// vectors of [4 x i32] in \a __X and \a __Y, and returns the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c SHA1MSG1 instruction.
///
/// \param __X
/// A 128-bit vector of [4 x i32] containing previous message elements.
/// \param __Y
/// A 128-bit vector of [4 x i32] containing previous message elements.
/// \returns A 128-bit vector of [4 x i32] containing the derived SHA-1
/// elements.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sha1msg1_epu32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_sha1msg1((__v4si)__X, (__v4si)__Y);
}
/// Performs the final calculation for deriving the next four SHA-1 message
/// elements using previous message elements from the 128-bit vectors of
/// [4 x i32] in \a __X and \a __Y, and returns the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c SHA1MSG2 instruction.
///
/// \param __X
/// A 128-bit vector of [4 x i32] containing an intermediate result.
/// \param __Y
/// A 128-bit vector of [4 x i32] containing previous message values.
/// \returns A 128-bit vector of [4 x i32] containing the updated SHA-1
/// values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sha1msg2_epu32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_sha1msg2((__v4si)__X, (__v4si)__Y);
}
/// Performs two rounds of SHA-256 operation using the following inputs: a
/// starting SHA-256 state (C, D, G, H) from the 128-bit vector of
/// [4 x i32] in \a __X; a starting SHA-256 state (A, B, E, F) from the
/// 128-bit vector of [4 x i32] in \a __Y; and a pre-computed sum of the
/// next two message elements (unsigned 32-bit integers) and corresponding
/// rounding constants from the 128-bit vector of [4 x i32] in \a __Z.
/// Returns the updated SHA-256 state (A, B, E, F) as a 128-bit vector of
/// [4 x i32].
///
/// The SHA-256 algorithm has a core loop of 64 iterations. This intrinsic
/// performs two of those iterations.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c SHA256RNDS2 instruction.
///
/// \param __X
/// A 128-bit vector of [4 x i32] containing part of the initial SHA-256
/// state.
/// \param __Y
/// A 128-bit vector of [4 x i32] containing part of the initial SHA-256
/// state.
/// \param __Z
/// A 128-bit vector of [4 x i32] containing additional input to the
/// SHA-256 operation.
/// \returns A 128-bit vector of [4 x i32] containing the updated SHA-256 state.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__X, (__v4si)__Y, (__v4si)__Z);
}
/// Performs an intermediate calculation for deriving the next four SHA-256
/// message elements using previous message elements from the 128-bit
/// vectors of [4 x i32] in \a __X and \a __Y, and returns the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c SHA256MSG1 instruction.
///
/// \param __X
/// A 128-bit vector of [4 x i32] containing previous message elements.
/// \param __Y
/// A 128-bit vector of [4 x i32] containing previous message elements.
/// \returns A 128-bit vector of [4 x i32] containing the updated SHA-256
/// values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sha256msg1_epu32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_sha256msg1((__v4si)__X, (__v4si)__Y);
}
/// Performs the final calculation for deriving the next four SHA-256 message
/// elements using previous message elements from the 128-bit vectors of
/// [4 x i32] in \a __X and \a __Y, and returns the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c SHA256MSG2 instruction.
///
/// \param __X
/// A 128-bit vector of [4 x i32] containing an intermediate result.
/// \param __Y
/// A 128-bit vector of [4 x i32] containing previous message values.
/// \returns A 128-bit vector of [4 x i32] containing the updated SHA-256
/// values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sha256msg2_epu32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_sha256msg2((__v4si)__X, (__v4si)__Y);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __SHAINTRIN_H */

third_party/intel/clang/sm3intrin.h vendored Normal file

@ -0,0 +1,238 @@
/*===-------------------- sm3intrin.h - SM3 intrinsics ---------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <sm3intrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H
#ifndef __SM3INTRIN_H
#define __SM3INTRIN_H
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("sm3"), \
__min_vector_width__(128)))
/// This intrinsic is one of the two SM3 message scheduling intrinsics. The
/// intrinsic performs an initial calculation for the next four SM3 message
/// words. The calculated results are stored in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_sm3msg1_epi32(__m128i __A, __m128i __B, __m128i __C)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM3MSG1 instruction.
///
/// \param __A
/// A 128-bit vector of [4 x int].
/// \param __B
/// A 128-bit vector of [4 x int].
/// \param __C
/// A 128-bit vector of [4 x int].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
/// count := n % 32
/// dest := (dword << count) | (dword >> (32 - count))
/// RETURN dest
/// }
/// DEFINE P1(x) {
/// RETURN x ^ ROL32(x, 15) ^ ROL32(x, 23)
/// }
/// W[0] := __C.dword[0]
/// W[1] := __C.dword[1]
/// W[2] := __C.dword[2]
/// W[3] := __C.dword[3]
/// W[7] := __A.dword[0]
/// W[8] := __A.dword[1]
/// W[9] := __A.dword[2]
/// W[10] := __A.dword[3]
/// W[13] := __B.dword[0]
/// W[14] := __B.dword[1]
/// W[15] := __B.dword[2]
/// TMP0 := W[7] ^ W[0] ^ ROL32(W[13], 15)
/// TMP1 := W[8] ^ W[1] ^ ROL32(W[14], 15)
/// TMP2 := W[9] ^ W[2] ^ ROL32(W[15], 15)
/// TMP3 := W[10] ^ W[3]
/// dst.dword[0] := P1(TMP0)
/// dst.dword[1] := P1(TMP1)
/// dst.dword[2] := P1(TMP2)
/// dst.dword[3] := P1(TMP3)
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sm3msg1_epi32(__m128i __A,
__m128i __B,
__m128i __C) {
return (__m128i)__builtin_ia32_vsm3msg1((__v4su)__A, (__v4su)__B,
(__v4su)__C);
}
/// This intrinsic is one of the two SM3 message scheduling intrinsics. The
/// intrinsic performs the final calculation for the next four SM3 message
/// words. The calculated results are stored in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_sm3msg2_epi32(__m128i __A, __m128i __B, __m128i __C)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM3MSG2 instruction.
///
/// \param __A
/// A 128-bit vector of [4 x int].
/// \param __B
/// A 128-bit vector of [4 x int].
/// \param __C
/// A 128-bit vector of [4 x int].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
/// count := n % 32
/// dest := (dword << count) | (dword >> (32-count))
/// RETURN dest
/// }
/// WTMP[0] := __A.dword[0]
/// WTMP[1] := __A.dword[1]
/// WTMP[2] := __A.dword[2]
/// WTMP[3] := __A.dword[3]
/// W[3] := __B.dword[0]
/// W[4] := __B.dword[1]
/// W[5] := __B.dword[2]
/// W[6] := __B.dword[3]
/// W[10] := __C.dword[0]
/// W[11] := __C.dword[1]
/// W[12] := __C.dword[2]
/// W[13] := __C.dword[3]
/// W[16] := ROL32(W[3], 7) ^ W[10] ^ WTMP[0]
/// W[17] := ROL32(W[4], 7) ^ W[11] ^ WTMP[1]
/// W[18] := ROL32(W[5], 7) ^ W[12] ^ WTMP[2]
/// W[19] := ROL32(W[6], 7) ^ W[13] ^ WTMP[3]
/// W[19] := W[19] ^ ROL32(W[16], 6) ^ ROL32(W[16], 15) ^ ROL32(W[16], 30)
/// dst.dword[0] := W[16]
/// dst.dword[1] := W[17]
/// dst.dword[2] := W[18]
/// dst.dword[3] := W[19]
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sm3msg2_epi32(__m128i __A,
__m128i __B,
__m128i __C) {
return (__m128i)__builtin_ia32_vsm3msg2((__v4su)__A, (__v4su)__B,
(__v4su)__C);
}
/// This intrinsic performs two rounds of SM3 operation using an initial SM3
/// state (C, D, G, H) from \a __A, an initial SM3 state (A, B, E, F) from
/// \a __B, and pre-computed message words from \a __C. The (C, D, G, H)
/// state in \a __A is expected to hold the non-rotated variables from the
/// previous state; the rotations are applied internally, as shown below.
/// The \a imm8 argument should contain the even round number for the first
/// of the two rounds computed by this instruction. The computation masks the
/// \a imm8 value by ANDing it with 0x3E so that only even round numbers from
/// 0 through 62 are used for this operation. The updated SM3 state
/// (A, B, E, F) is stored in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_sm3rnds2_epi32(__m128i __A, __m128i __B, __m128i __C,
///                            const int imm8)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM3RNDS2 instruction.
///
/// \param __A
/// A 128-bit vector of [4 x int].
/// \param __B
/// A 128-bit vector of [4 x int].
/// \param __C
/// A 128-bit vector of [4 x int].
/// \param imm8
/// An 8-bit constant integer.
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
/// count := n % 32
/// dest := (dword << count) | (dword >> (32-count))
/// RETURN dest
/// }
/// DEFINE P0(dword) {
/// RETURN dword ^ ROL32(dword, 9) ^ ROL32(dword, 17)
/// }
/// DEFINE FF(x,y,z, round){
/// IF round < 16
/// RETURN (x ^ y ^ z)
/// ELSE
/// RETURN (x & y) | (x & z) | (y & z)
/// FI
/// }
/// DEFINE GG(x, y, z, round){
/// IF round < 16
/// RETURN (x ^ y ^ z)
/// ELSE
/// RETURN (x & y) | (~x & z)
/// FI
/// }
/// A[0] := __B.dword[3]
/// B[0] := __B.dword[2]
/// C[0] := __A.dword[3]
/// D[0] := __A.dword[2]
/// E[0] := __B.dword[1]
/// F[0] := __B.dword[0]
/// G[0] := __A.dword[1]
/// H[0] := __A.dword[0]
/// W[0] := __C.dword[0]
/// W[1] := __C.dword[1]
/// W[4] := __C.dword[2]
/// W[5] := __C.dword[3]
/// C[0] := ROL32(C[0], 9)
/// D[0] := ROL32(D[0], 9)
/// G[0] := ROL32(G[0], 19)
/// H[0] := ROL32(H[0], 19)
/// ROUND := imm8 & 0x3E
/// IF ROUND < 16
/// CONST := 0x79CC4519
/// ELSE
/// CONST := 0x7A879D8A
/// FI
/// CONST := ROL32(CONST,ROUND)
/// FOR i:= 0 to 1
/// S1 := ROL32((ROL32(A[i], 12) + E[i] + CONST), 7)
/// S2 := S1 ^ ROL32(A[i], 12)
/// T1 := FF(A[i], B[i], C[i], ROUND) + D[i] + S2 + (W[i] ^ W[i+4])
/// T2 := GG(E[i], F[i], G[i], ROUND) + H[i] + S1 + W[i]
/// D[i+1] := C[i]
/// C[i+1] := ROL32(B[i],9)
/// B[i+1] := A[i]
/// A[i+1] := T1
/// H[i+1] := G[i]
/// G[i+1] := ROL32(F[i], 19)
/// F[i+1] := E[i]
/// E[i+1] := P0(T2)
/// CONST := ROL32(CONST, 1)
/// ENDFOR
/// dst.dword[3] := A[2]
/// dst.dword[2] := B[2]
/// dst.dword[1] := E[2]
/// dst.dword[0] := F[2]
/// dst[MAX:128] := 0
/// \endcode
#define _mm_sm3rnds2_epi32(A, B, C, D) \
(__m128i) __builtin_ia32_vsm3rnds2((__v4su)A, (__v4su)B, (__v4su)C, (int)D)
#undef __DEFAULT_FN_ATTRS128
#endif // __SM3INTRIN_H
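/* Illustrative scalar reference, not part of the vendored header: the P0()
 * and P1() permutations from the pseudocode above, written in plain C, can
 * be used to cross-check a single dword lane of the SM3 intrinsics. The
 * names are hypothetical. */
static __inline__ unsigned int
example_rol32(unsigned int __x, unsigned __n) /* requires 0 < __n < 32 */
{
  return (__x << __n) | (__x >> (32 - __n));
}
static __inline__ unsigned int
example_sm3_p0(unsigned int __x) /* used by the round function */
{
  return __x ^ example_rol32(__x, 9) ^ example_rol32(__x, 17);
}
static __inline__ unsigned int
example_sm3_p1(unsigned int __x) /* used by the message schedule */
{
  return __x ^ example_rol32(__x, 15) ^ example_rol32(__x, 23);
}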

third_party/intel/clang/sm4intrin.h vendored Normal file

@ -0,0 +1,269 @@
/*===--------------- sm4intrin.h - SM4 intrinsics -----------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <sm4intrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H
#ifndef __SM4INTRIN_H
#define __SM4INTRIN_H
/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
/// operates on independent 128-bit lanes. The calculated results are
/// stored in \a dst.
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_sm4key4_epi32(__m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
///
/// \param __A
/// A 128-bit vector of [4 x int].
/// \param __B
/// A 128-bit vector of [4 x int].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
/// count := n % 32
/// dest := (dword << count) | (dword >> (32-count))
/// RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
/// RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
/// tmp.byte[0] := SBOX_BYTE(dword, 0)
/// tmp.byte[1] := SBOX_BYTE(dword, 1)
/// tmp.byte[2] := SBOX_BYTE(dword, 2)
/// tmp.byte[3] := SBOX_BYTE(dword, 3)
/// RETURN tmp
/// }
/// DEFINE L_KEY(dword) {
/// RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
/// }
/// DEFINE T_KEY(dword) {
/// RETURN L_KEY(lower_t(dword))
/// }
/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
/// RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 0
/// P[0] := __B.xmm[i].dword[0]
/// P[1] := __B.xmm[i].dword[1]
/// P[2] := __B.xmm[i].dword[2]
/// P[3] := __B.xmm[i].dword[3]
/// C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
/// C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
/// C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
/// C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
/// DEST.xmm[i].dword[0] := C[0]
/// DEST.xmm[i].dword[1] := C[1]
/// DEST.xmm[i].dword[2] := C[2]
/// DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:128] := 0
/// \endcode
#define _mm_sm4key4_epi32(A, B) \
(__m128i) __builtin_ia32_vsm4key4128((__v4su)A, (__v4su)B)
/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
/// operates on independent 128-bit lanes. The calculated results are
/// stored in \a dst.
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_sm4key4_epi32(__m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
///
/// \param __A
/// A 256-bit vector of [8 x int].
/// \param __B
/// A 256-bit vector of [8 x int].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
/// count := n % 32
/// dest := (dword << count) | (dword >> (32-count))
/// RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
/// RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
/// tmp.byte[0] := SBOX_BYTE(dword, 0)
/// tmp.byte[1] := SBOX_BYTE(dword, 1)
/// tmp.byte[2] := SBOX_BYTE(dword, 2)
/// tmp.byte[3] := SBOX_BYTE(dword, 3)
/// RETURN tmp
/// }
/// DEFINE L_KEY(dword) {
/// RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
/// }
/// DEFINE T_KEY(dword) {
/// RETURN L_KEY(lower_t(dword))
/// }
/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
/// RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 1
/// P[0] := __B.xmm[i].dword[0]
/// P[1] := __B.xmm[i].dword[1]
/// P[2] := __B.xmm[i].dword[2]
/// P[3] := __B.xmm[i].dword[3]
/// C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
/// C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
/// C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
/// C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
/// DEST.xmm[i].dword[0] := C[0]
/// DEST.xmm[i].dword[1] := C[1]
/// DEST.xmm[i].dword[2] := C[2]
/// DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:256] := 0
/// \endcode
#define _mm256_sm4key4_epi32(A, B) \
(__m256i) __builtin_ia32_vsm4key4256((__v8su)A, (__v8su)B)
/// This intrinsic performs four rounds of SM4 encryption. The intrinsic
/// operates on independent 128-bit lanes. The calculated results are
/// stored in \a dst.
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_sm4rnds4_epi32(__m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
///
/// \param __A
/// A 128-bit vector of [4 x int].
/// \param __B
/// A 128-bit vector of [4 x int].
/// \returns
/// A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
/// count := n % 32
/// dest := (dword << count) | (dword >> (32-count))
/// RETURN dest
/// }
/// DEFINE lower_t(dword) {
/// tmp.byte[0] := SBOX_BYTE(dword, 0)
/// tmp.byte[1] := SBOX_BYTE(dword, 1)
/// tmp.byte[2] := SBOX_BYTE(dword, 2)
/// tmp.byte[3] := SBOX_BYTE(dword, 3)
/// RETURN tmp
/// }
/// DEFINE L_RND(dword) {
/// tmp := dword
/// tmp := tmp ^ ROL32(dword, 2)
/// tmp := tmp ^ ROL32(dword, 10)
/// tmp := tmp ^ ROL32(dword, 18)
/// tmp := tmp ^ ROL32(dword, 24)
/// RETURN tmp
/// }
/// DEFINE T_RND(dword) {
/// RETURN L_RND(lower_t(dword))
/// }
/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
/// RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 0
/// P[0] := __B.xmm[i].dword[0]
/// P[1] := __B.xmm[i].dword[1]
/// P[2] := __B.xmm[i].dword[2]
/// P[3] := __B.xmm[i].dword[3]
/// C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
/// C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
/// C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
/// C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
/// DEST.xmm[i].dword[0] := C[0]
/// DEST.xmm[i].dword[1] := C[1]
/// DEST.xmm[i].dword[2] := C[2]
/// DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:128] := 0
/// \endcode
#define _mm_sm4rnds4_epi32(A, B) \
(__m128i) __builtin_ia32_vsm4rnds4128((__v4su)A, (__v4su)B)
/// This intrinsic performs four rounds of SM4 encryption. The intrinsic
/// operates on independent 128-bit lanes. The calculated results are
/// stored in \a dst.
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_sm4rnds4_epi32(__m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
///
/// \param __A
/// A 256-bit vector of [8 x int].
/// \param __B
/// A 256-bit vector of [8 x int].
/// \returns
/// A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
/// count := n % 32
/// dest := (dword << count) | (dword >> (32-count))
/// RETURN dest
/// }
/// DEFINE lower_t(dword) {
/// tmp.byte[0] := SBOX_BYTE(dword, 0)
/// tmp.byte[1] := SBOX_BYTE(dword, 1)
/// tmp.byte[2] := SBOX_BYTE(dword, 2)
/// tmp.byte[3] := SBOX_BYTE(dword, 3)
/// RETURN tmp
/// }
/// DEFINE L_RND(dword) {
/// tmp := dword
/// tmp := tmp ^ ROL32(dword, 2)
/// tmp := tmp ^ ROL32(dword, 10)
/// tmp := tmp ^ ROL32(dword, 18)
/// tmp := tmp ^ ROL32(dword, 24)
/// RETURN tmp
/// }
/// DEFINE T_RND(dword) {
/// RETURN L_RND(lower_t(dword))
/// }
/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
/// RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 1
/// P[0] := __B.xmm[i].dword[0]
/// P[1] := __B.xmm[i].dword[1]
/// P[2] := __B.xmm[i].dword[2]
/// P[3] := __B.xmm[i].dword[3]
/// C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
/// C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
/// C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
/// C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
/// DEST.xmm[i].dword[0] := C[0]
/// DEST.xmm[i].dword[1] := C[1]
/// DEST.xmm[i].dword[2] := C[2]
/// DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:256] := 0
/// \endcode
#define _mm256_sm4rnds4_epi32(A, B) \
(__m256i) __builtin_ia32_vsm4rnds4256((__v8su)A, (__v8su)B)
#endif // __SM4INTRIN_H
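/* Illustrative scalar reference, not part of the vendored header: the
 * L_RND() linear transform from the pseudocode above, written in plain C.
 * Together with the S-box substitution it forms the T_RND() step that
 * VSM4RNDS4 applies per round. The name is hypothetical. */
static __inline__ unsigned int
example_sm4_l_rnd(unsigned int __x)
{
  unsigned int __rol2  = (__x << 2)  | (__x >> 30);
  unsigned int __rol10 = (__x << 10) | (__x >> 22);
  unsigned int __rol18 = (__x << 18) | (__x >> 14);
  unsigned int __rol24 = (__x << 24) | (__x >> 8);
  return __x ^ __rol2 ^ __rol10 ^ __rol18 ^ __rol24;
}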

third_party/intel/clang/smmintrin.h vendored Normal file
File diff suppressed because it is too large

third_party/intel/clang/tbmintrin.h vendored Normal file

@ -0,0 +1,140 @@
/*===---- tbmintrin.h - TBM intrinsics -------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86INTRIN_H
#error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __TBMINTRIN_H
#define __TBMINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm")))
#define __bextri_u32(a, b) \
((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \
(unsigned int)(b)))
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcfill_u32(unsigned int __a)
{
return __a & (__a + 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blci_u32(unsigned int __a)
{
return __a | ~(__a + 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcic_u32(unsigned int __a)
{
return ~__a & (__a + 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcmsk_u32(unsigned int __a)
{
return __a ^ (__a + 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blcs_u32(unsigned int __a)
{
return __a | (__a + 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsfill_u32(unsigned int __a)
{
return __a | (__a - 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsic_u32(unsigned int __a)
{
return ~__a | (__a - 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__t1mskc_u32(unsigned int __a)
{
return ~__a | (__a + 1);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__tzmsk_u32(unsigned int __a)
{
return ~__a & (__a - 1);
}
#ifdef __x86_64__
#define __bextri_u64(a, b) \
((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(a), \
(unsigned long long)(b)))
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcfill_u64(unsigned long long __a)
{
return __a & (__a + 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blci_u64(unsigned long long __a)
{
return __a | ~(__a + 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcic_u64(unsigned long long __a)
{
return ~__a & (__a + 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcmsk_u64(unsigned long long __a)
{
return __a ^ (__a + 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blcs_u64(unsigned long long __a)
{
return __a | (__a + 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsfill_u64(unsigned long long __a)
{
return __a | (__a - 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__blsic_u64(unsigned long long __a)
{
return ~__a | (__a - 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__t1mskc_u64(unsigned long long __a)
{
return ~__a | (__a + 1);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__tzmsk_u64(unsigned long long __a)
{
return ~__a & (__a - 1);
}
#endif
#undef __DEFAULT_FN_ATTRS
#endif /* __TBMINTRIN_H */
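/* Illustrative self-check, not part of the vendored header: worked values
 * for a few of the helpers above with __a = 0x58 (binary 0101'1000). The
 * function name is hypothetical; the calling code must have the "tbm"
 * target feature enabled. */
static __inline__ int __attribute__((__target__("tbm")))
example_tbm_identities_hold(void)
{
  return __blsfill_u32(0x58) == 0x5Fu  /* 0x58 | 0x57: fill trailing zeros */
      && __tzmsk_u32(0x58)   == 0x07u  /* ~0x58 & 0x57: mask of trailing zeros */
      && __blcmsk_u32(0x58)  == 0x01u; /* 0x58 ^ 0x59: lowest clear bit mask */
}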

third_party/intel/clang/tmmintrin.h vendored Normal file

@ -0,0 +1,784 @@
/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __TMMINTRIN_H
#define __TMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include "pmmintrin.h"
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("ssse3,no-evex512"), __min_vector_width__(64)))
#define __DEFAULT_FN_ATTRS_MMX \
__attribute__((__always_inline__, __nodebug__, \
__target__("mmx,ssse3,no-evex512"), \
__min_vector_width__(64)))
/// Computes the absolute value of each of the packed 8-bit signed
/// integers in the source operand and stores the 8-bit unsigned integer
/// results in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PABSB instruction.
///
/// \param __a
/// A 64-bit vector of [8 x i8].
/// \returns A 64-bit integer vector containing the absolute values of the
/// elements in the operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_abs_pi8(__m64 __a)
{
return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
}
/// Computes the absolute value of each of the packed 8-bit signed
/// integers in the source operand and stores the 8-bit unsigned integer
/// results in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPABSB instruction.
///
/// \param __a
/// A 128-bit vector of [16 x i8].
/// \returns A 128-bit integer vector containing the absolute values of the
/// elements in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi8(__m128i __a)
{
return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
}
/// Computes the absolute value of each of the packed 16-bit signed
/// integers in the source operand and stores the 16-bit unsigned integer
/// results in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PABSW instruction.
///
/// \param __a
/// A 64-bit vector of [4 x i16].
/// \returns A 64-bit integer vector containing the absolute values of the
/// elements in the operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_abs_pi16(__m64 __a)
{
return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
}
/// Computes the absolute value of each of the packed 16-bit signed
/// integers in the source operand and stores the 16-bit unsigned integer
/// results in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPABSW instruction.
///
/// \param __a
/// A 128-bit vector of [8 x i16].
/// \returns A 128-bit integer vector containing the absolute values of the
/// elements in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi16(__m128i __a)
{
return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
}
/// Computes the absolute value of each of the packed 32-bit signed
/// integers in the source operand and stores the 32-bit unsigned integer
/// results in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PABSD instruction.
///
/// \param __a
/// A 64-bit vector of [2 x i32].
/// \returns A 64-bit integer vector containing the absolute values of the
/// elements in the operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_abs_pi32(__m64 __a)
{
return (__m64)__builtin_ia32_pabsd((__v2si)__a);
}
/// Computes the absolute value of each of the packed 32-bit signed
/// integers in the source operand and stores the 32-bit unsigned integer
/// results in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPABSD instruction.
///
/// \param __a
/// A 128-bit vector of [4 x i32].
/// \returns A 128-bit integer vector containing the absolute values of the
/// elements in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi32(__m128i __a)
{
return (__m128i)__builtin_elementwise_abs((__v4si)__a);
}
/// Concatenates the two 128-bit integer vector operands, and
/// right-shifts the result by the number of bytes specified in the immediate
/// operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
/// A 128-bit vector of [16 x i8] containing one of the source operands.
/// \param b
/// A 128-bit vector of [16 x i8] containing one of the source operands.
/// \param n
/// An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 128-bit integer vector containing the concatenated right-shifted
/// value.
#define _mm_alignr_epi8(a, b, n) \
((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (n)))
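/* Illustrative usage sketch, not part of the vendored header: with an
 * immediate of 4, the result is bytes 4..19 of the 32-byte concatenation
 * hi:lo, which is how mis-aligned streaming reads are commonly re-aligned.
 * The helper name is hypothetical. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
example_window_shift4(__m128i __hi, __m128i __lo)
{
  return _mm_alignr_epi8(__hi, __lo, 4);
}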
/// Concatenates the two 64-bit integer vector operands, and right-shifts
/// the result by the number of bytes specified in the immediate operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
/// A 64-bit vector of [8 x i8] containing one of the source operands.
/// \param b
/// A 64-bit vector of [8 x i8] containing one of the source operands.
/// \param n
/// An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 64-bit integer vector containing the concatenated right-shifted
/// value.
#define _mm_alignr_pi8(a, b, n) \
((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [8 x i16].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPHADDW instruction.
///
/// \param __a
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the lower bits of the
/// destination.
/// \param __b
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the upper bits of the
/// destination.
/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
/// both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hadd_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
}
/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [4 x i32].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPHADDD instruction.
///
/// \param __a
/// A 128-bit vector of [4 x i32] containing one of the source operands. The
/// horizontal sums of the values are stored in the lower bits of the
/// destination.
/// \param __b
/// A 128-bit vector of [4 x i32] containing one of the source operands. The
/// horizontal sums of the values are stored in the upper bits of the
/// destination.
/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
/// both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hadd_epi32(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
}
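/* Illustrative usage sketch, not part of the vendored header: applying
 * _mm_hadd_epi32 twice leaves the total a0+a1+a2+a3 in every lane, giving a
 * simple horizontal reduction. The helper name is hypothetical. */
static __inline__ int __DEFAULT_FN_ATTRS
example_horizontal_sum_epi32(__m128i __a)
{
  __m128i __t = _mm_hadd_epi32(__a, __a); /* [a0+a1, a2+a3, a0+a1, a2+a3] */
  __t = _mm_hadd_epi32(__t, __t);         /* total in every lane */
  return _mm_cvtsi128_si32(__t);          /* extract lane 0 */
}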
/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of [4 x i16].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PHADDW instruction.
///
/// \param __a
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the lower bits of the
/// destination.
/// \param __b
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the upper bits of the
/// destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
/// operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hadd_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
}
/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of [2 x i32].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PHADDD instruction.
///
/// \param __a
/// A 64-bit vector of [2 x i32] containing one of the source operands. The
/// horizontal sums of the values are stored in the lower bits of the
/// destination.
/// \param __b
/// A 64-bit vector of [2 x i32] containing one of the source operands. The
/// horizontal sums of the values are stored in the upper bits of the
/// destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
/// operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hadd_pi32(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
}
/// Horizontally adds, with saturation, the adjacent pairs of values contained
/// in two packed 128-bit vectors of [8 x i16].
///
/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
/// less than 0x8000 are saturated to 0x8000.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPHADDSW instruction.
///
/// \param __a
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the lower bits of the
/// destination.
/// \param __b
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the upper bits of the
/// destination.
/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
/// sums of both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hadds_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
}
/// Horizontally adds, with saturation, the adjacent pairs of values contained
/// in two packed 64-bit vectors of [4 x i16].
///
/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
/// less than 0x8000 are saturated to 0x8000.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PHADDSW instruction.
///
/// \param __a
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the lower bits of the
/// destination.
/// \param __b
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal sums of the values are stored in the upper bits of the
/// destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
/// sums of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hadds_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 128-bit vectors of [8 x i16].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPHSUBW instruction.
///
/// \param __a
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the lower bits of
/// the destination.
/// \param __b
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the upper bits of
/// the destination.
/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
/// of both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hsub_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 128-bit vectors of [4 x i32].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPHSUBD instruction.
///
/// \param __a
/// A 128-bit vector of [4 x i32] containing one of the source operands. The
/// horizontal differences between the values are stored in the lower bits of
/// the destination.
/// \param __b
/// A 128-bit vector of [4 x i32] containing one of the source operands. The
/// horizontal differences between the values are stored in the upper bits of
/// the destination.
/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
/// of both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hsub_epi32(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of [4 x i16].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PHSUBW instruction.
///
/// \param __a
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the lower bits of
/// the destination.
/// \param __b
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the upper bits of
/// the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
/// of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hsub_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of [2 x i32].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PHSUBD instruction.
///
/// \param __a
/// A 64-bit vector of [2 x i32] containing one of the source operands. The
/// horizontal differences between the values are stored in the lower bits of
/// the destination.
/// \param __b
/// A 64-bit vector of [2 x i32] containing one of the source operands. The
/// horizontal differences between the values are stored in the upper bits of
/// the destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
/// of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hsub_pi32(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
}
/// Horizontally subtracts, with saturation, the adjacent pairs of values
/// contained in two packed 128-bit vectors of [8 x i16].
///
/// Positive differences greater than 0x7FFF are saturated to 0x7FFF.
/// Negative differences less than 0x8000 are saturated to 0x8000.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPHSUBSW instruction.
///
/// \param __a
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the lower bits of
/// the destination.
/// \param __b
/// A 128-bit vector of [8 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the upper bits of
/// the destination.
/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
/// differences of both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hsubs_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
}
/// Horizontally subtracts, with saturation, the adjacent pairs of values
/// contained in two packed 64-bit vectors of [4 x i16].
///
/// Positive differences greater than 0x7FFF are saturated to 0x7FFF.
/// Negative differences less than 0x8000 are saturated to 0x8000.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PHSUBSW instruction.
///
/// \param __a
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the lower bits of
/// the destination.
/// \param __b
/// A 64-bit vector of [4 x i16] containing one of the source operands. The
/// horizontal differences between the values are stored in the upper bits of
/// the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
/// differences of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hsubs_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
}
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
/// values contained in the first source operand and packed 8-bit signed
/// integer values contained in the second source operand, adds pairs of
/// contiguous products with signed saturation, and writes the 16-bit sums to
/// the corresponding bits in the destination.
///
/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
/// both operands are multiplied, and the sum of both results is written to
/// bits [15:0] of the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
///
/// \param __a
/// A 128-bit integer vector containing the first source operand.
/// \param __b
/// A 128-bit integer vector containing the second source operand.
/// \returns A 128-bit integer vector containing the sums of products of both
/// operands: \n
/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
/// \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maddubs_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
}
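/* Illustrative usage sketch, not part of the vendored header: a common
 * pattern pairs _mm_maddubs_epi16 (u8 x s8 -> paired 16-bit sums) with
 * _mm_madd_epi16 against a vector of ones to widen the partial sums to
 * 32 bits, e.g. for small dot products in quantized integer kernels. The
 * helper name is hypothetical. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
example_dot_u8s8(__m128i __u, __m128i __s)
{
  __m128i __pairs = _mm_maddubs_epi16(__u, __s);     /* 8 x i16 partial sums */
  return _mm_madd_epi16(__pairs, _mm_set1_epi16(1)); /* 4 x i32 sums of four products */
}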
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
/// values contained in the first source operand and packed 8-bit signed
/// integer values contained in the second source operand, adds pairs of
/// contiguous products with signed saturation, and writes the 16-bit sums to
/// the corresponding bits in the destination.
///
/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
/// both operands are multiplied, and the sum of both results is written to
/// bits [15:0] of the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PMADDUBSW instruction.
///
/// \param __a
/// A 64-bit integer vector containing the first source operand.
/// \param __b
/// A 64-bit integer vector containing the second source operand.
/// \returns A 64-bit integer vector containing the sums of products of both
/// operands: \n
/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_maddubs_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
}
/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
/// products to the 18 most significant bits by right-shifting, rounds the
/// truncated value by adding 1, and writes bits [16:1] to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPMULHRSW instruction.
///
/// \param __a
/// A 128-bit vector of [8 x i16] containing one of the source operands.
/// \param __b
/// A 128-bit vector of [8 x i16] containing one of the source operands.
/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
/// products of both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhrs_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
}
/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
/// products to the 18 most significant bits by right-shifting, rounds the
/// truncated value by adding 1, and writes bits [16:1] to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PMULHRSW instruction.
///
/// \param __a
/// A 64-bit vector of [4 x i16] containing one of the source operands.
/// \param __b
/// A 64-bit vector of [4 x i16] containing one of the source operands.
/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
/// products of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_mulhrs_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
}
/// Copies the 8-bit integers from a 128-bit integer vector to the
/// destination or clears 8-bit values in the destination, as specified by
/// the second source operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPSHUFB instruction.
///
/// \param __a
/// A 128-bit integer vector containing the values to be copied.
/// \param __b
/// A 128-bit integer vector containing control bytes corresponding to
/// positions in the destination:
/// Bit 7: \n
/// 1: Clear the corresponding byte in the destination. \n
/// 0: Copy the selected source byte to the corresponding byte in the
/// destination. \n
/// Bits [6:4] Reserved. \n
/// Bits [3:0] select the source byte to be copied.
/// \returns A 128-bit integer vector containing the copied or cleared values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_shuffle_epi8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
}
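/* Illustrative usage sketch, not part of the vendored header: a control
 * vector of 15..0 reverses the byte order of a 128-bit vector, a common
 * building block for endian swaps. The helper name is hypothetical. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
example_reverse_bytes(__m128i __a)
{
  const __m128i __rev = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                      7, 6, 5, 4, 3, 2, 1, 0);
  return _mm_shuffle_epi8(__a, __rev);
}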
/// Copies the 8-bit integers from a 64-bit integer vector to the
/// destination or clears 8-bit values in the destination, as specified by
/// the second source operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PSHUFB instruction.
///
/// \param __a
/// A 64-bit integer vector containing the values to be copied.
/// \param __b
/// A 64-bit integer vector containing control bytes corresponding to
/// positions in the destination:
/// Bit 7: \n
/// 1: Clear the corresponding byte in the destination. \n
/// 0: Copy the selected source byte to the corresponding byte in the
/// destination. \n
/// Bits [2:0] select the source byte to be copied.
/// \returns A 64-bit integer vector containing the copied or cleared values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_shuffle_pi8(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
}
/// For each 8-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand.
///
/// If the byte in the second source is negative, calculate the two's
/// complement of the corresponding byte in the first source, and write that
/// value to the destination. If the byte in the second source is positive,
/// copy the corresponding byte from the first source to the destination. If
/// the byte in the second source is zero, clear the corresponding byte in
/// the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPSIGNB instruction.
///
/// \param __a
/// A 128-bit integer vector containing the values to be copied.
/// \param __b
/// A 128-bit integer vector containing control bytes corresponding to
/// positions in the destination.
/// \returns A 128-bit integer vector containing the resultant values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sign_epi8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
}
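/* Illustrative sketch, not part of the upstream header: the per-byte rule
   described above, applied to a single element. A negative control byte
   negates, a zero control byte clears, a positive control byte copies. */
static __inline__ signed char
__example_sign_one_byte(signed char __v, signed char __ctl)
{
  if (__ctl < 0)
    return (signed char)-__v;   /* two's complement of the source byte */
  if (__ctl == 0)
    return 0;                   /* clear the destination byte          */
  return __v;                   /* copy the source byte unchanged      */
}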
/// For each 16-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand.
///
/// If the word in the second source is negative, calculate the two's
/// complement of the corresponding word in the first source, and write that
/// value to the destination. If the word in the second source is positive,
/// copy the corresponding word from the first source to the destination. If
/// the word in the second source is zero, clear the corresponding word in
/// the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPSIGNW instruction.
///
/// \param __a
/// A 128-bit integer vector containing the values to be copied.
/// \param __b
/// A 128-bit integer vector containing control words corresponding to
/// positions in the destination.
/// \returns A 128-bit integer vector containing the resultant values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sign_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
}
/// For each 32-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand.
///
/// If the doubleword in the second source is negative, calculate the two's
/// complement of the corresponding doubleword in the first source, and write
/// that value to the destination. If the doubleword in the second source is
/// positive, copy the corresponding doubleword from the first source to the
/// destination. If the doubleword in the second source is zero, clear the
/// corresponding doubleword in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VPSIGND instruction.
///
/// \param __a
/// A 128-bit integer vector containing the values to be copied.
/// \param __b
/// A 128-bit integer vector containing control doublewords corresponding to
/// positions in the destination.
/// \returns A 128-bit integer vector containing the resultant values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sign_epi32(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
}
/// For each 8-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand.
///
/// If the byte in the second source is negative, calculate the two's
/// complement of the corresponding byte in the first source, and write that
/// value to the destination. If the byte in the second source is positive,
/// copy the corresponding byte from the first source to the destination. If
/// the byte in the second source is zero, clear the corresponding byte in
/// the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PSIGNB instruction.
///
/// \param __a
/// A 64-bit integer vector containing the values to be copied.
/// \param __b
/// A 64-bit integer vector containing control bytes corresponding to
/// positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sign_pi8(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
}
/// For each 16-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand.
///
/// If the word in the second source is negative, calculate the two's
/// complement of the corresponding word in the first source, and write that
/// value to the destination. If the word in the second source is positive,
/// copy the corresponding word from the first source to the destination. If
/// the word in the second source is zero, clear the corresponding word in
/// the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PSIGNW instruction.
///
/// \param __a
/// A 64-bit integer vector containing the values to be copied.
/// \param __b
/// A 64-bit integer vector containing control words corresponding to
/// positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sign_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
}
/// For each 32-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand.
///
/// If the doubleword in the second source is negative, calculate the two's
/// complement of the corresponding doubleword in the first source, and
/// write that value to the destination. If the doubleword in the second
/// source is positive, copy the corresponding doubleword from the first
/// source to the destination. If the doubleword in the second source is
/// zero, clear the corresponding doubleword in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PSIGND instruction.
///
/// \param __a
/// A 64-bit integer vector containing the values to be copied.
/// \param __b
/// A 64-bit integer vector containing two control doublewords corresponding
/// to positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sign_pi32(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX
#endif /* __TMMINTRIN_H */

View file

@ -0,0 +1,56 @@
/*===------------- tsxldtrkintrin.h - tsxldtrk intrinsics ------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <tsxldtrkintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __TSXLDTRKINTRIN_H
#define __TSXLDTRKINTRIN_H
/* Define the default attributes for the functions in this file */
#define _DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("tsxldtrk")))
/// Marks the start of a TSX (RTM) suspend load address tracking region. If
/// this intrinsic is used inside a transactional region, subsequent loads
/// are not added to the read set of the transaction. If it's used inside a
/// suspend load address tracking region, it causes a transaction abort.
/// If it's used outside of a transactional region, it behaves like a NOP.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c XSUSLDTRK instruction.
///
static __inline__ void _DEFAULT_FN_ATTRS
_xsusldtrk (void)
{
__builtin_ia32_xsusldtrk();
}
/// Marks the end of a TSX (RTM) suspend load address tracking region. If this
/// intrinsic is used inside a suspend load address tracking region, it ends
/// the suspend region and all following load addresses are added to the
/// transaction read set. If it's used inside an active transaction but not
/// in a suspend region, it causes a transaction abort. If it's used outside
/// of a transactional region, it behaves like a NOP.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c XRESLDTRK instruction.
///
static __inline__ void _DEFAULT_FN_ATTRS
_xresldtrk (void)
{
__builtin_ia32_xresldtrk();
}
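/* Illustrative usage sketch, not part of the upstream header: a suspend
   load address tracking region nested inside an RTM transaction. This
   assumes the RTM intrinsics _xbegin(), _xend(), and _XBEGIN_STARTED from
   <immintrin.h> are visible; the pointer is any load the caller does not
   want added to the transaction's read set. */
static __inline__ unsigned long long
__attribute__((__always_inline__, __nodebug__, __target__("rtm,tsxldtrk")))
__example_untracked_read(const unsigned long long *__p)
{
  unsigned long long __v = 0;
  if (_xbegin() == _XBEGIN_STARTED) {
    _xsusldtrk();   /* loads after this are not added to the read set */
    __v = *__p;     /* untracked load                                  */
    _xresldtrk();   /* resume normal read-set tracking                 */
    _xend();
  }
  return __v;
}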
#undef _DEFAULT_FN_ATTRS
#endif /* __TSXLDTRKINTRIN_H */

157
third_party/intel/clang/uintrintrin.h vendored Normal file
View file

@ -0,0 +1,157 @@
/*===------------------ uintrintrin.h - UINTR intrinsics -------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86GPRINTRIN_H
#error "Never use <uintrintrin.h> directly; include <x86gprintrin.h> instead."
#endif
#ifndef __UINTRINTRIN_H
#define __UINTRINTRIN_H
/* Define the default attributes for the functions in this file */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("uintr")))
#ifdef __x86_64__
struct __uintr_frame
{
unsigned long long rip;
unsigned long long rflags;
unsigned long long rsp;
};
/// Clears the user interrupt flag (UIF). Its effect takes place immediately: a
/// user interrupt cannot be delivered on the instruction boundary following
/// CLUI. Can be executed only if CR4.UINT = 1, the logical processor is in
/// 64-bit mode, and software is not executing inside an enclave; otherwise,
/// it causes an invalid-opcode exception. Causes a transactional abort if
/// executed inside a transactional region; the abort loads EAX as it would
/// have had the abort been due to an execution of CLI.
///
/// \headerfile <x86gprintrin.h>
///
/// This intrinsic corresponds to the <c> CLUI </c> instruction.
///
/// \code{.operation}
/// UIF := 0
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS
_clui (void)
{
__builtin_ia32_clui();
}
/// Sets the user interrupt flag (UIF). Its effect takes place immediately; a
/// user interrupt may be delivered on the instruction boundary following
/// STUI. Can be executed only if CR4.UINT = 1, the logical processor is in
/// 64-bit mode, and software is not executing inside an enclave; otherwise,
/// it causes an invalid-opcode exception. Causes a transactional abort if
/// executed inside a transactional region; the abort loads EAX as it would
/// have had the abort been due to an execution of STI.
///
/// \headerfile <x86gprintrin.h>
///
/// This intrinsic corresponds to the <c> STUI </c> instruction.
///
/// \code{.operation}
/// UIF := 1
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS
_stui (void)
{
__builtin_ia32_stui();
}
/// Gets the current value of the user interrupt flag (UIF). Can be executed
/// regardless of CPL and inside a transactional region. Can be executed only
/// if CR4.UINT = 1, the logical processor is in 64-bit mode, and software is
/// not executing inside an enclave; otherwise, it causes an invalid-opcode
/// exception.
///
/// \headerfile <x86gprintrin.h>
///
/// This intrinsic corresponds to the <c> TESTUI </c> instruction.
///
/// \returns The current value of the user interrupt flag (UIF).
///
/// \code{.operation}
/// CF := UIF
/// ZF := 0
/// AF := 0
/// OF := 0
/// PF := 0
/// SF := 0
/// dst := CF
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_testui (void)
{
return __builtin_ia32_testui();
}
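/* Illustrative usage sketch, not part of the upstream header: mask user
   interrupts around a short critical region and restore the previous UIF
   value afterwards. __work is a hypothetical callback supplied by the
   caller. */
static __inline__ void __DEFAULT_FN_ATTRS
__example_with_user_interrupts_masked(void (*__work)(void))
{
  unsigned char __old = _testui();   /* remember whether UIF was set   */
  _clui();                           /* mask user interrupts           */
  __work();
  if (__old)
    _stui();                         /* restore UIF only if it was set */
}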
/// Sends an interprocessor user interrupt. Can be executed only if
/// CR4.UINT = IA32_UINT_TT[0] = 1, the logical processor is in 64-bit mode,
/// and software is not executing inside an enclave; otherwise, it causes an
/// invalid-opcode exception. May be executed at any privilege level; all of
/// its memory accesses are performed with supervisor privilege.
///
/// \headerfile <x86gprintrin.h>
///
/// This intrinsic corresponds to the <c> SENDUIPI </c> instruction.
///
/// \param __a
/// Index of user-interrupt target table entry in user-interrupt target
/// table.
///
/// \code{.operation}
/// IF __a > UITTSZ
/// GP (0)
/// FI
/// tempUITTE := MEM[UITTADDR + (a<<4)]
/// // tempUITTE must be valid, and can't have any reserved bit set
/// IF (tempUITTE.V == 0 OR tempUITTE[7:1] != 0)
/// GP (0)
/// FI
/// tempUPID := MEM[tempUITTE.UPIDADDR] // under lock
/// // tempUPID can't have any reserved bit set
/// IF (tempUPID[15:2] != 0 OR tempUPID[31:24] != 0)
/// GP (0) // release lock
/// FI
/// tempUPID.PIR[tempUITTE.UV] := 1;
/// IF (tempUPID.SN == 0 AND tempUPID.ON == 0)
/// tempUPID.ON := 1
/// sendNotify := 1
/// ELSE
/// sendNotify := 0
/// FI
/// MEM[tempUITTE.UPIDADDR] := tempUPID // release lock
/// IF sendNotify == 1
/// IF IA32_APIC_BASE[10] == 1 // local APIC is in x2APIC mode
/// // send ordinary IPI with vector tempUPID.NV to 32-bit physical APIC
/// // ID tempUPID.NDST
/// SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST)
/// ELSE
/// // send ordinary IPI with vector tempUPID.NV to 8-bit physical APIC
/// // ID tempUPID.NDST[15:8]
/// SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST[15:8])
/// FI
/// FI
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS
_senduipi (unsigned long long __a)
{
__builtin_ia32_senduipi(__a);
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#endif /* __UINTRINTRIN_H */

51
third_party/intel/clang/usermsrintrin.h vendored Normal file
View file

@ -0,0 +1,51 @@
/*===--------------- usermsrintrin.h - USERMSR intrinsics -----------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86GPRINTRIN_H
#error "Never use <usermsrintrin.h> directly; include <x86gprintrin.h> instead."
#endif // __X86GPRINTRIN_H
#ifndef __USERMSRINTRIN_H
#define __USERMSRINTRIN_H
#ifdef __x86_64__
/// Reads the contents of the 64-bit MSR specified in \a __A and returns it.
///
/// This intrinsic corresponds to the <c> URDMSR </c> instruction.
/// \param __A
/// An unsigned long long containing the address of the MSR to read.
///
/// \code{.operation}
/// DEST := MSR[__A]
/// \endcode
static __inline__ unsigned long long
__attribute__((__always_inline__, __nodebug__, __target__("usermsr")))
_urdmsr(unsigned long long __A) {
return __builtin_ia32_urdmsr(__A);
}
/// Writes the contents of \a __B into the 64-bit MSR specified in \a __A.
///
/// This intrinsic corresponds to the <c> UWRMSR </c> instruction.
///
/// \param __A
/// An unsigned long long containing the address of the MSR to write.
/// \param __B
/// An unsigned long long containing the value to be written to the MSR.
///
/// \code{.operation}
/// MSR[__A] := __B
/// \endcode
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("usermsr")))
_uwrmsr(unsigned long long __A, unsigned long long __B) {
return __builtin_ia32_uwrmsr(__A, __B);
}
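/* Illustrative usage sketch, not part of the upstream header: a user-mode
   read-modify-write of an MSR. The MSR address is passed in by the caller;
   which MSRs are actually accessible this way depends on what the operating
   system has enabled. */
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("usermsr")))
__example_msr_set_bits(unsigned long long __msr, unsigned long long __mask)
{
  unsigned long long __v = _urdmsr(__msr);   /* read the current value      */
  _uwrmsr(__msr, __v | __mask);              /* write it back with bits set */
}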
#endif // __x86_64__
#endif // __USERMSRINTRIN_H

87
third_party/intel/clang/vaesintrin.h vendored Normal file
View file

@ -0,0 +1,87 @@
/*===------------------ vaesintrin.h - VAES intrinsics ---------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <vaesintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __VAESINTRIN_H
#define __VAESINTRIN_H
/* Default attributes for YMM forms. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes"), __min_vector_width__(256)))
/* Default attributes for ZMM forms. */
#define __DEFAULT_FN_ATTRS_F \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512f,evex512,vaes"), \
__min_vector_width__(512)))
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesenc_epi128(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_aesenc256((__v4di) __A,
(__v4di) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesdec_epi128(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_aesdec256((__v4di) __A,
(__v4di) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesenclast_epi128(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_aesenclast256((__v4di) __A,
(__v4di) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesdeclast_epi128(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_aesdeclast256((__v4di) __A,
(__v4di) __B);
}
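/* Illustrative usage sketch, not part of the upstream header: AES-128
   encryption of two independent 128-bit blocks packed into one __m256i,
   with each block using the matching 128-bit lane of the expanded key in
   __rk (11 round keys). Assumes the AVX2 intrinsic _mm256_xor_si256 from
   <immintrin.h> is visible. */
static __inline__ __m256i
__attribute__((__always_inline__, __nodebug__, __target__("vaes,avx2"),
               __min_vector_width__(256)))
__example_aes128_encrypt_pair(__m256i __blocks, const __m256i __rk[11])
{
  int __i;
  __blocks = _mm256_xor_si256(__blocks, __rk[0]);           /* whitening   */
  for (__i = 1; __i < 10; ++__i)
    __blocks = _mm256_aesenc_epi128(__blocks, __rk[__i]);   /* rounds 1..9 */
  return _mm256_aesenclast_epi128(__blocks, __rk[10]);      /* round 10    */
}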
#ifdef __AVX512FINTRIN_H
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesenc_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesdec_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesenclast_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesdeclast_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A,
(__v8di) __B);
}
#endif // __AVX512FINTRIN_H
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_F
#endif // __VAESINTRIN_H

View file

@ -0,0 +1,30 @@
/*===------------ vpclmulqdqintrin.h - VPCLMULQDQ intrinsics ---------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <vpclmulqdqintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __VPCLMULQDQINTRIN_H
#define __VPCLMULQDQINTRIN_H
#define _mm256_clmulepi64_epi128(A, B, I) \
((__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), \
(char)(I)))
#ifdef __AVX512FINTRIN_H
#define _mm512_clmulepi64_epi128(A, B, I) \
((__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), \
(char)(I)))
#endif // __AVX512FINTRIN_H
#endif /* __VPCLMULQDQINTRIN_H */

42
third_party/intel/clang/waitpkgintrin.h vendored Normal file
View file

@ -0,0 +1,42 @@
/*===----------------------- waitpkgintrin.h - WAITPKG --------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <waitpkgintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __WAITPKGINTRIN_H
#define __WAITPKGINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("waitpkg")))
static __inline__ void __DEFAULT_FN_ATTRS
_umonitor (void * __address)
{
__builtin_ia32_umonitor (__address);
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_umwait (unsigned int __control, unsigned long long __counter)
{
return __builtin_ia32_umwait (__control,
(unsigned int)(__counter >> 32), (unsigned int)__counter);
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_tpause (unsigned int __control, unsigned long long __counter)
{
return __builtin_ia32_tpause (__control,
(unsigned int)(__counter >> 32), (unsigned int)__counter);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __WAITPKGINTRIN_H */

View file

@ -0,0 +1,24 @@
/*===-------------- wbnoinvdintrin.h - wbnoinvd intrinsic-------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <wbnoinvdintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __WBNOINVDINTRIN_H
#define __WBNOINVDINTRIN_H
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("wbnoinvd")))
_wbnoinvd (void)
{
__builtin_ia32_wbnoinvd ();
}
#endif /* __WBNOINVDINTRIN_H */

23
third_party/intel/clang/wmmintrin.h vendored Normal file
View file

@ -0,0 +1,23 @@
/*===---- wmmintrin.h - AES intrinsics ------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __WMMINTRIN_H
#define __WMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include "emmintrin.h"
#include "__wmmintrin_aes.h"
#include "__wmmintrin_pclmul.h"
#endif /* __WMMINTRIN_H */

63
third_party/intel/clang/x86gprintrin.h vendored Normal file
View file

@ -0,0 +1,63 @@
/*===--------------- x86gprintrin.h - X86 GPR intrinsics ------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86GPRINTRIN_H
#define __X86GPRINTRIN_H
#if !defined(__SCE__) || __has_feature(modules) || defined(__HRESET__)
#include "hresetintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__UINTR__)
#include "uintrintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__USERMSR__)
#include "usermsrintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__CRC32__)
#include "crc32intrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__PRFCHI__)
#include "prfchiintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__RAOINT__)
#include "raointintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__CMPCCXADD__)
#include "cmpccxaddintrin.h"
#endif
#if defined(__i386__)
#define __SAVE_GPRBX "mov {%%ebx, %%eax |eax, ebx};"
#define __RESTORE_GPRBX "mov {%%eax, %%ebx |ebx, eax};"
#define __TMPGPR "eax"
#else
// When targeting 64-bit, a 32-bit operand generates a 32-bit result that is
// zero-extended to 64 bits in the destination general-purpose register.
// This means "mov x, %ebx" will clobber the upper 32 bits of rbx, so we
// should preserve the full 64-bit register rbx.
#define __SAVE_GPRBX "mov {%%rbx, %%rax |rax, rbx};"
#define __RESTORE_GPRBX "mov {%%rax, %%rbx |rbx, rax};"
#define __TMPGPR "rax"
#endif
#define __SSC_MARK(__Tag) \
__asm__ __volatile__( __SAVE_GPRBX \
"mov {%0, %%ebx|ebx, %0}; " \
".byte 0x64, 0x67, 0x90; " \
__RESTORE_GPRBX \
::"i"(__Tag) \
: __TMPGPR );
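/* Illustrative usage sketch, not part of the upstream header: bracketing a
   region of interest with SSC marks so a simulator or tracing tool can find
   it. The tag values 0x111/0x222 follow a common start/stop convention used
   with Intel SDE, but any immediate constant may be used. */
static __inline__ void
__example_ssc_marked_region(void (*__workload)(void))
{
  __SSC_MARK(0x111);   /* begin marked region */
  __workload();
  __SSC_MARK(0x222);   /* end marked region   */
}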
#endif /* __X86GPRINTRIN_H */

Some files were not shown because too many files have changed in this diff.