Release Cosmopolitan v3.8.0

This change switches C++ exception handling from SJLJ to standard DWARF.
It's needed because clang for aarch64 doesn't support SJLJ. It turns out
that libunwind has a bare-metal configuration that made this easy to do.
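
As an illustration (not from this change), here is a hedged sketch of what
the switch means for user code: with DWARF exception handling the unwind
tables live in the .eh_frame section, so the non-throwing path of a try
block carries no runtime setup, whereas SJLJ registered unwind state on
every frame entry.

#include <cstdio>
#include <stdexcept>

int main() {
  try {
    // with DWARF EH this throw is resolved by walking .eh_frame tables
    throw std::runtime_error("unwound via .eh_frame, not setjmp/longjmp");
  } catch (const std::exception &e) {
    std::printf("caught: %s\n", e.what());
  }
}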

This change also gets the new experimental cosmocc -mclang flag working
well enough that it can now build all of llamafile, cutting build latency
3x without trading away any runtime performance.

The int_fast16_t and int_fast32_t types are now always defined as 32-bit
in the interest of better ABI consistency between the cosmocc -mgcc and
-mclang modes.
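
A minimal sketch of the resulting invariant, checkable in either mode (the
values mirror the __INT_FAST*_MAX__ macros redefined later in this diff):

#include <stdint.h>

static_assert(sizeof(int_fast16_t) == 4, "int_fast16_t is 32-bit in both modes");
static_assert(sizeof(int_fast32_t) == 4, "int_fast32_t is 32-bit in both modes");
static_assert(INT_FAST16_MAX == 2147483647, "limit matches __INT_FAST16_MAX__");
static_assert(INT_FAST32_MAX == 2147483647, "limit matches __INT_FAST32_MAX__");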
Justine Tunney 2024-08-30 20:12:26 -07:00
parent 5b9862907c
commit c9152b6f14
GPG key ID: BE714B4575D6E328
188 changed files with 199063 additions and 636 deletions


@ -1,5 +1,5 @@
{
"C_Cpp.default.compilerPath": ".cosmocc/3.7.1/bin/aarch64-linux-cosmo-c++",
"C_Cpp.default.compilerPath": ".cosmocc/3.8.0/bin/aarch64-linux-cosmo-c++",
"C_Cpp.default.compilerArgs": [
"-nostdinc",
"-nostdlib",


@ -147,10 +147,10 @@ export MODE
export SOURCE_DATE_EPOCH
export TMPDIR
COSMOCC = .cosmocc/3.7.1
COSMOCC = .cosmocc/3.8.0
BOOTSTRAP = $(COSMOCC)/bin
TOOLCHAIN = $(COSMOCC)/bin/$(ARCH)-linux-cosmo-
DOWNLOAD := $(shell build/download-cosmocc.sh $(COSMOCC) 3.7.1 13b65b0e659b493bd82f3d0a319d0265d66f849839e484aa2a54191024711e85)
DOWNLOAD := $(shell build/download-cosmocc.sh $(COSMOCC) 3.8.0 813c6b2f95062d2e0a845307a79505424cb98cb038e8013334f8a22e3b92a474)
IGNORE := $(shell $(MKDIR) $(TMPDIR))


@ -103,10 +103,8 @@ SECTIONS {
*(.eh_frame_entry .eh_frame_entry.*)
}
.eh_frame : ONLY_IF_RO {
KEEP(*(.eh_frame))
*(.eh_frame.*)
}
__eh_frame_hdr_start = SIZEOF(.eh_frame_hdr) > 0 ? ADDR(.eh_frame_hdr) : 0;
__eh_frame_hdr_end = SIZEOF(.eh_frame_hdr) > 0 ? . : 0;
.gcc_except_table : ONLY_IF_RO {
*(.gcc_except_table .gcc_except_table.*)
@ -127,9 +125,11 @@ SECTIONS {
. += CONSTANT(MAXPAGESIZE);
. = DATA_SEGMENT_ALIGN(CONSTANT(MAXPAGESIZE), CONSTANT(COMMONPAGESIZE));
.eh_frame : ONLY_IF_RW {
.eh_frame : {
__eh_frame_start = .;
KEEP(*(.eh_frame))
*(.eh_frame.*)
__eh_frame_end = .;
}
.gnu_extab : ONLY_IF_RW {


@ -329,6 +329,10 @@ SECTIONS {
*(.ubsan.types)
*(.ubsan.data)
__eh_frame_hdr_start_actual = .;
*(.eh_frame_hdr)
__eh_frame_hdr_end_actual = .;
/* Legal Notices */
__notices = .;
KEEP(*(.notice))
@ -422,6 +426,11 @@ SECTIONS {
KEEP(*(.dtors))
__fini_array_end = .;
__eh_frame_start = .;
KEEP(*(.eh_frame))
*(.eh_frame.*)
__eh_frame_end = .;
/*BEGIN: Post-Initialization Read-Only */
. = ALIGN(. != 0 ? __SIZEOF_POINTER__ : 0);
KEEP(*(SORT_BY_NAME(.piro.relo.sort.*)))
@ -601,6 +610,9 @@ ape_text_memsz = ape_text_filesz;
ape_text_align = CONSTANT(COMMONPAGESIZE);
ape_text_rva = RVA(ape_text_vaddr);
__eh_frame_hdr_start = __eh_frame_hdr_end_actual > __eh_frame_hdr_start_actual ? __eh_frame_hdr_start_actual : 0;
__eh_frame_hdr_end = __eh_frame_hdr_end_actual > __eh_frame_hdr_start_actual ? __eh_frame_hdr_end_actual : 0;
/* we roundup here because xnu wants the file load segments page-aligned */
/* but we don't want to add the nop padding to the ape program, so we'll */
/* let ape.S dd read past the end of the file into the wrapping binaries */
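
For context, a hedged sketch (the standard linker-script symbol idiom, not
code from this diff) of how bracketing symbols like __eh_frame_start and
__eh_frame_end surface on the C++ side, which is presumably what lets
libunwind's bare-metal configuration locate the frame data:

// the linker script defines these; they have addresses but no storage
extern "C" unsigned char __eh_frame_start[];
extern "C" unsigned char __eh_frame_end[];

static unsigned long eh_frame_size() {
  // only the symbol addresses are meaningful
  return static_cast<unsigned long>(__eh_frame_end - __eh_frame_start);
}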


@ -92,10 +92,7 @@ DEFAULT_COPTS ?= \
-fno-gnu-unique \
-fstrict-aliasing \
-fstrict-overflow \
-fno-semantic-interposition \
-fno-dwarf2-cfi-asm \
-fno-unwind-tables \
-fno-asynchronous-unwind-tables
-fno-semantic-interposition
ifeq ($(ARCH), x86_64)
# Microsoft says "[a]ny memory below the stack beyond the red zone
@ -139,8 +136,6 @@ DEFAULT_CFLAGS = \
DEFAULT_CXXFLAGS = \
-std=gnu++23 \
-fno-rtti \
-fno-exceptions \
-fuse-cxa-atexit \
-Wno-int-in-bool-context \
-Wno-narrowing \


@ -6,14 +6,14 @@ if [ -n "$OBJDUMP" ]; then
fi
find_objdump() {
if [ -x .cosmocc/3.6.0/bin/$1-linux-cosmo-objdump ]; then
OBJDUMP=.cosmocc/3.6.0/bin/$1-linux-cosmo-objdump
elif [ -x .cosmocc/3.6.0/bin/$1-linux-musl-objdump ]; then
OBJDUMP=.cosmocc/3.6.0/bin/$1-linux-musl-objdump
elif [ -x "$COSMO/.cosmocc/3.6.0/bin/$1-linux-cosmo-objdump" ]; then
OBJDUMP="$COSMO/.cosmocc/3.6.0/bin/$1-linux-cosmo-objdump"
elif [ -x "$COSMO/.cosmocc/3.6.0/bin/$1-linux-musl-objdump" ]; then
OBJDUMP="$COSMO/.cosmocc/3.6.0/bin/$1-linux-musl-objdump"
if [ -x .cosmocc/3.8.0/bin/$1-linux-cosmo-objdump ]; then
OBJDUMP=.cosmocc/3.8.0/bin/$1-linux-cosmo-objdump
elif [ -x .cosmocc/3.8.0/bin/$1-linux-musl-objdump ]; then
OBJDUMP=.cosmocc/3.8.0/bin/$1-linux-musl-objdump
elif [ -x "$COSMO/.cosmocc/3.8.0/bin/$1-linux-cosmo-objdump" ]; then
OBJDUMP="$COSMO/.cosmocc/3.8.0/bin/$1-linux-cosmo-objdump"
elif [ -x "$COSMO/.cosmocc/3.8.0/bin/$1-linux-musl-objdump" ]; then
OBJDUMP="$COSMO/.cosmocc/3.8.0/bin/$1-linux-musl-objdump"
else
echo "error: toolchain not found (try running 'cosmocc --update' or 'make' in the cosmo monorepo)" >&2
exit 1


@ -241,8 +241,9 @@ class set
private:
friend class set;
node_type* node_;
node_type* root_;
explicit reverse_iterator(node_type* node) : node_(node)
explicit reverse_iterator(node_type* node, node_type* root) : node_(node), root_(root)
{
}
};
@ -347,17 +348,17 @@ class set
reverse_iterator rbegin()
{
return reverse_iterator(rightmost(root_));
return reverse_iterator(rightmost(root_), root_);
}
const_reverse_iterator rbegin() const
{
return const_reverse_iterator(rightmost(root_));
return const_reverse_iterator(rightmost(root_), root_);
}
const_reverse_iterator crbegin() const
{
return const_reverse_iterator(rightmost(root_));
return const_reverse_iterator(rightmost(root_), root_);
}
iterator end() noexcept
@ -377,17 +378,17 @@ class set
reverse_iterator rend()
{
return reverse_iterator(nullptr);
return reverse_iterator(nullptr, root_);
}
const_reverse_iterator rend() const
{
return const_reverse_iterator(nullptr);
return const_reverse_iterator(nullptr, root_);
}
const_reverse_iterator crend() const
{
return const_reverse_iterator(nullptr);
return const_reverse_iterator(nullptr, root_);
}
void clear() noexcept


@ -94,6 +94,7 @@ EXAMPLES_DIRECTDEPS = \
THIRD_PARTY_VQSORT \
THIRD_PARTY_XED \
THIRD_PARTY_LIBCXXABI \
THIRD_PARTY_LIBUNWIND \
THIRD_PARTY_ZLIB \
TOOL_ARGS \
TOOL_BUILD_LIB \


@ -3,8 +3,8 @@
#endif
#define __COSMOPOLITAN_MAJOR__ 3
#define __COSMOPOLITAN_MINOR__ 7
#define __COSMOPOLITAN_PATCH__ 1
#define __COSMOPOLITAN_MINOR__ 8
#define __COSMOPOLITAN_PATCH__ 0
#define __COSMOPOLITAN__ \
(100000000 * __COSMOPOLITAN_MAJOR__ + 1000000 * __COSMOPOLITAN_MINOR__ + \
__COSMOPOLITAN_PATCH__)
@ -93,6 +93,30 @@
#include "libc/integral/llp64.inc"
#endif
#undef __INT_FAST16_MAX__
#undef __INT_FAST16_TYPE__
#undef __UINT_FAST16_MAX__
#undef __INT_FAST16_WIDTH__
#undef __UINT_FAST16_TYPE__
#define __INT_FAST16_MAX__ 2147483647
#define __INT_FAST16_TYPE__ int
#define __UINT_FAST16_MAX__ 4294967295U
#define __INT_FAST16_WIDTH__ 32
#define __UINT_FAST16_TYPE__ unsigned int
#undef __INT_FAST32_MAX__
#undef __INT_FAST32_TYPE__
#undef __UINT_FAST32_MAX__
#undef __INT_FAST32_WIDTH__
#undef __UINT_FAST32_TYPE__
#define __INT_FAST32_MAX__ 2147483647
#define __INT_FAST32_TYPE__ int
#define __UINT_FAST32_MAX__ 4294967295U
#define __INT_FAST32_WIDTH__ 32
#define __UINT_FAST32_TYPE__ unsigned int
#if !(__ASSEMBLER__ + __LINKER__ + 0)
#ifdef __STDC__
#include "libc/integral/c.inc"

libc/intrin/personality.c (new file, 22 lines)

@ -0,0 +1,22 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2024 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
__attribute__((__weak__)) void __gxx_personality_v0() {
__builtin_trap();
}
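
Because the stub is declared weak, any strong definition in the link wins,
so the trap above only fires when a C++ frame is unwound in a program that
never linked the real personality routine. A hypothetical illustration of
the override (the signature below is simplified to match the stub, not
libc++abi's actual prototype):

// a strong definition anywhere in the link replaces the weak stub
extern "C" void __gxx_personality_v0() {
  // libc++abi's real DWARF EH personality logic would live here
}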


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/ammintrin.h"
#else
#include "third_party/intel/ammintrin.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_ */


@ -1 +1,5 @@
#ifdef __clang__
#include "third_party/intel/clang/amxcomplexintrin.h"
#else
#include "third_party/intel/amxcomplexintrin.internal.h"
#endif


@ -1 +1,5 @@
#ifdef __clang__
#include "third_party/intel/clang/amxfp16intrin.h"
#else
#include "third_party/intel/amxfp16intrin.internal.h"
#endif


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_
#ifdef __clang__
#include "third_party/aarch64/clang/arm_acle.h"
#else
#include "third_party/aarch64/arm_acle.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_ */


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_
#ifdef __clang__
#include "third_party/aarch64/clang/arm_bf16.h"
#else
#include "third_party/aarch64/arm_bf16.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_ */


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_
#ifdef __clang__
#include "third_party/aarch64/clang/arm_fp16.h"
#else
#include "third_party/aarch64/arm_fp16.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_ */


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_
#ifdef __clang__
#include "third_party/aarch64/clang/arm_neon.h"
#else
#include "third_party/aarch64/arm_neon.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_ */

libc/isystem/arm_sve.h (new file, 8 lines)

@ -0,0 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_SVE_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_SVE_H_
#ifdef __clang__
#include "third_party/aarch64/clang/arm_sve.h"
#else
#include "third_party/aarch64/arm_sve.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_SVE_H_ */


@ -0,0 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_VECTOR_TYPES_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_VECTOR_TYPES_H_
#ifdef __clang__
#include "third_party/aarch64/clang/arm_vector_types.h"
#else
#include "third_party/aarch64/arm_vector_types.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_VECTOR_TYPES_H_ */


@ -1 +1,5 @@
#ifdef __clang__
#include "third_party/intel/clang/avxifmaintrin.h"
#else
#include "third_party/intel/avxifmaintrin.internal.h"
#endif


@ -1 +1,5 @@
#ifdef __clang__
#include "third_party/intel/clang/avxneconvertintrin.h"
#else
#include "third_party/intel/avxneconvertintrin.internal.h"
#endif


@ -1 +1,5 @@
#ifdef __clang__
#include "third_party/intel/clang/avxvnniint16intrin.h"
#else
#include "third_party/intel/avxvnniint16intrin.internal.h"
#endif


@ -1 +1,5 @@
#ifdef __clang__
#include "third_party/intel/clang/avxvnniint8intrin.h"
#else
#include "third_party/intel/avxvnniint8intrin.internal.h"
#endif


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/clzerointrin.h"
#else
#include "third_party/intel/clzerointrin.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_ */


@ -1 +1,5 @@
#ifdef __clang__
#include "third_party/intel/clang/cmpccxaddintrin.h"
#else
#include "third_party/intel/cmpccxaddintrin.internal.h"
#endif


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/emmintrin.h"
#else
#include "third_party/intel/emmintrin.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_ */


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/immintrin.h"
#else
#include "third_party/intel/immintrin.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_ */


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/mm_malloc.h"
#else
#include "third_party/intel/mm_malloc.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_ */


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/mmintrin.h"
#else
#include "third_party/intel/mmintrin.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_ */


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/mwaitxintrin.h"
#else
#include "third_party/intel/mwaitxintrin.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_ */


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/nmmintrin.h"
#else
#include "third_party/intel/nmmintrin.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_ */


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/pmmintrin.h"
#else
#include "third_party/intel/pmmintrin.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_ */


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/popcntintrin.h"
#else
#include "third_party/intel/popcntintrin.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_ */


@ -1 +1,5 @@
#ifdef __clang__
#include "third_party/intel/clang/prfchiintrin.h"
#else
#include "third_party/intel/prfchiintrin.internal.h"
#endif


@ -1 +1,5 @@
#ifdef __clang__
#include "third_party/intel/clang/raointintrin.h"
#else
#include "third_party/intel/raointintrin.internal.h"
#endif


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/sgxintrin.h"
#else
#include "third_party/intel/sgxintrin.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_ */


@ -1 +1,5 @@
#ifdef __clang__
#include "third_party/intel/clang/sha512intrin.h"
#else
#include "third_party/intel/sha512intrin.internal.h"
#endif


@ -1 +1,5 @@
#ifdef __clang__
#include "third_party/intel/clang/sm3intrin.h"
#else
#include "third_party/intel/sm3intrin.internal.h"
#endif


@ -1 +1,5 @@
#ifdef __clang__
#include "third_party/intel/clang/sm4intrin.h"
#else
#include "third_party/intel/sm4intrin.internal.h"
#endif


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/smmintrin.h"
#else
#include "third_party/intel/smmintrin.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_ */


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/tmmintrin.h"
#else
#include "third_party/intel/tmmintrin.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_ */


@ -1 +1,5 @@
#ifdef __clang__
#include "third_party/intel/clang/usermsrintrin.h"
#else
#include "third_party/intel/usermsrintrin.internal.h"
#endif


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/wmmintrin.h"
#else
#include "third_party/intel/wmmintrin.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_ */


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/x86intrin.h"
#else
#include "third_party/intel/x86intrin.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_ */


@ -1,4 +1,8 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_
#ifdef __clang__
#include "third_party/intel/clang/xmmintrin.h"
#else
#include "third_party/intel/xmmintrin.internal.h"
#endif
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_ */


@ -42,7 +42,8 @@ $(LIBC_MEM_A_OBJS): private \
COPTS += \
-fno-sanitize=all \
-Wframe-larger-than=4096 \
-Walloca-larger-than=4096
-Walloca-larger-than=4096 \
-fexceptions
o/$(MODE)/libc/mem/asan.o: private \
CFLAGS += \


@ -7,10 +7,10 @@ void *bsearch(const void *, const void *, size_t, size_t,
void *bsearch_r(const void *, const void *, size_t, size_t,
int (*)(const void *, const void *, void *), void *)
paramsnonnull((1, 2, 5)) nosideeffect;
void qsort3(void *, size_t, size_t,
int (*)(const void *, const void *)) libcesque paramsnonnull();
void qsort(void *, size_t, size_t,
int (*)(const void *, const void *)) libcesque paramsnonnull();
void qsort3(void *, size_t, size_t, int (*)(const void *, const void *))
paramsnonnull();
void qsort(void *, size_t, size_t, int (*)(const void *, const void *))
paramsnonnull();
void qsort_r(void *, size_t, size_t,
int (*)(const void *, const void *, void *), void *)
paramsnonnull((1, 4));


@ -48,6 +48,8 @@ TEST_LIBC_TINYMATH_DIRECTDEPS = \
THIRD_PARTY_DOUBLECONVERSION \
THIRD_PARTY_GDTOA \
THIRD_PARTY_LIBCXX \
THIRD_PARTY_LIBCXXABI \
THIRD_PARTY_LIBUNWIND \
TEST_LIBC_TINYMATH_DEPS := \
$(call uniq,$(foreach x,$(TEST_LIBC_TINYMATH_DIRECTDEPS),$($(x))))


@ -21,13 +21,6 @@
#include "libc/mem/mem.h"
#include "libc/runtime/runtime.h"
// this dontthrow keyword SHOULD break this test. it's probably passing
// because we're currently using SjLj exceptions. the day we can change
// things, remove `dontthrow` and this test will still be a useful help
extern "C" dontthrow void qsort_(void *, size_t, size_t,
int (*)(const void *,
const void *)) asm("qsort");
struct Resource {
char *p;
Resource() {
@ -60,7 +53,7 @@ int A[3] = {3, 2, 1};
int Work(void) {
Resource r;
pPoke(r.p);
qsort_(A, 3, sizeof(int), cmp);
qsort(A, 3, sizeof(int), cmp);
return A[0];
}
int (*pWork)(void) = Work;


@ -3,4 +3,4 @@
PKGS += THIRD_PARTY_AARCH64
THIRD_PARTY_AARCH64_HDRS = $(filter %.h,$(THIRD_PARTY_AARCH64_FILES))
THIRD_PARTY_AARCH64_FILES := $(wildcard third_party/aarch64/*)
THIRD_PARTY_AARCH64_FILES := $(wildcard third_party/aarch64/*) $(wildcard third_party/aarch64/clang/*)

third_party/aarch64/clang/arm64intr.h (new vendored file, 35 lines)

@ -0,0 +1,35 @@
/*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
/* Only include this if we're compiling for the windows platform. */
#ifndef _MSC_VER
#include_next <arm64intr.h>
#else
#ifndef __ARM64INTR_H
#define __ARM64INTR_H
typedef enum
{
_ARM64_BARRIER_SY = 0xF,
_ARM64_BARRIER_ST = 0xE,
_ARM64_BARRIER_LD = 0xD,
_ARM64_BARRIER_ISH = 0xB,
_ARM64_BARRIER_ISHST = 0xA,
_ARM64_BARRIER_ISHLD = 0x9,
_ARM64_BARRIER_NSH = 0x7,
_ARM64_BARRIER_NSHST = 0x6,
_ARM64_BARRIER_NSHLD = 0x5,
_ARM64_BARRIER_OSH = 0x3,
_ARM64_BARRIER_OSHST = 0x2,
_ARM64_BARRIER_OSHLD = 0x1
} _ARM64INTR_BARRIER_TYPE;
#endif /* __ARM64INTR_H */
#endif /* _MSC_VER */

third_party/aarch64/clang/arm_acle.h (new vendored file, 888 lines)

@ -0,0 +1,888 @@
/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* The Arm C Language Extensions specifications can be found in the following
* link: https://github.com/ARM-software/acle/releases
*
* The ACLE section numbers are subject to change. When consulting the
* specifications, it is recommended to search using section titles if
* the section numbers look outdated.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __ARM_ACLE_H
#define __ARM_ACLE_H
#ifndef __ARM_ACLE
#error "ACLE intrinsics support not enabled."
#endif
#include <stdint.h>
#if defined(__cplusplus)
extern "C" {
#endif
/* 7 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
/* 7.3 Memory barriers */
#if !__has_builtin(__dmb)
#define __dmb(i) __builtin_arm_dmb(i)
#endif
#if !__has_builtin(__dsb)
#define __dsb(i) __builtin_arm_dsb(i)
#endif
#if !__has_builtin(__isb)
#define __isb(i) __builtin_arm_isb(i)
#endif
/* 7.4 Hints */
#if !__has_builtin(__wfi)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
__builtin_arm_wfi();
}
#endif
#if !__has_builtin(__wfe)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
__builtin_arm_wfe();
}
#endif
#if !__has_builtin(__sev)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
__builtin_arm_sev();
}
#endif
#if !__has_builtin(__sevl)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
__builtin_arm_sevl();
}
#endif
#if !__has_builtin(__yield)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
__builtin_arm_yield();
}
#endif
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __dbg(t) __builtin_arm_dbg(t)
#endif
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define _CHKFEAT_GCS 1
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__chkfeat(uint64_t __features) {
return __builtin_arm_chkfeat(__features) ^ __features;
}
#endif
/* 7.5 Swap */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__swp(uint32_t __x, volatile uint32_t *__p) {
uint32_t v;
do
v = __builtin_arm_ldrex(__p);
while (__builtin_arm_strex(__x, __p));
return v;
}
/* 7.6 Memory prefetch intrinsics */
/* 7.6.1 Data prefetch */
#define __pld(addr) __pldx(0, 0, 0, addr)
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __pldx(access_kind, cache_level, retention_policy, addr) \
__builtin_arm_prefetch(addr, access_kind, 1)
#else
#define __pldx(access_kind, cache_level, retention_policy, addr) \
__builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
#endif
/* 7.6.2 Instruction prefetch */
#define __pli(addr) __plix(0, 0, addr)
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __plix(cache_level, retention_policy, addr) \
__builtin_arm_prefetch(addr, 0, 0)
#else
#define __plix(cache_level, retention_policy, addr) \
__builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
#endif
/* 7.7 NOP */
#if !defined(_MSC_VER) || (!defined(__aarch64__) && !defined(__arm64ec__))
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
__builtin_arm_nop();
}
#endif
/* 8 DATA-PROCESSING INTRINSICS */
/* 8.2 Miscellaneous data-processing intrinsics */
/* ROR */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__ror(uint32_t __x, uint32_t __y) {
__y %= 32;
if (__y == 0)
return __x;
return (__x >> __y) | (__x << (32 - __y));
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rorll(uint64_t __x, uint32_t __y) {
__y %= 64;
if (__y == 0)
return __x;
return (__x >> __y) | (__x << (64 - __y));
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rorl(unsigned long __x, uint32_t __y) {
#if __SIZEOF_LONG__ == 4
return __ror(__x, __y);
#else
return __rorll(__x, __y);
#endif
}
/* CLZ */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t __t) {
return __builtin_arm_clz(__t);
}
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
return __builtin_arm_clz(__t);
#else
return __builtin_arm_clz64(__t);
#endif
}
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t __t) {
return __builtin_arm_clz64(__t);
}
/* CLS */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__cls(uint32_t __t) {
return __builtin_arm_cls(__t);
}
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
return __builtin_arm_cls(__t);
#else
return __builtin_arm_cls64(__t);
#endif
}
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsll(uint64_t __t) {
return __builtin_arm_cls64(__t);
}
/* REV */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev(uint32_t __t) {
return __builtin_bswap32(__t);
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__revl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
return __builtin_bswap32(__t);
#else
return __builtin_bswap64(__t);
#endif
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__revll(uint64_t __t) {
return __builtin_bswap64(__t);
}
/* REV16 */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev16(uint32_t __t) {
return __ror(__rev(__t), 16);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t __t) {
return (((uint64_t)__rev16(__t >> 32)) << 32) | (uint64_t)__rev16((uint32_t)__t);
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rev16l(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
return __rev16(__t);
#else
return __rev16ll(__t);
#endif
}
/* REVSH */
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t __t) {
return (int16_t)__builtin_bswap16((uint16_t)__t);
}
/* RBIT */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rbit(uint32_t __t) {
return __builtin_arm_rbit(__t);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t __t) {
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
__builtin_arm_rbit(__t >> 32);
#else
return __builtin_arm_rbit64(__t);
#endif
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rbitl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
return __rbit(__t);
#else
return __rbitll(__t);
#endif
}
/* 8.3 16-bit multiplications */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbb(int32_t __a, int32_t __b) {
return __builtin_arm_smulbb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbt(int32_t __a, int32_t __b) {
return __builtin_arm_smulbt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultb(int32_t __a, int32_t __b) {
return __builtin_arm_smultb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultt(int32_t __a, int32_t __b) {
return __builtin_arm_smultt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwb(int32_t __a, int32_t __b) {
return __builtin_arm_smulwb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwt(int32_t __a, int32_t __b) {
return __builtin_arm_smulwt(__a, __b);
}
#endif
/*
* 8.4 Saturating intrinsics
*
* FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
* intrinsics are implemented and the flag is enabled.
*/
/* 8.4.1 Width-specified saturation intrinsics */
#if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif
/* 8.4.2 Saturating addition and subtraction intrinsics */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t __t, int32_t __v) {
return __builtin_arm_qadd(__t, __v);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qsub(int32_t __t, int32_t __v) {
return __builtin_arm_qsub(__t, __v);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qdbl(int32_t __t) {
return __builtin_arm_qadd(__t, __t);
}
#endif
/* 8.4.3 Accumulating multiplications */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlabb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabt(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlabt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatb(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlatb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatt(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlatt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawb(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlawb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawt(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlawt(__a, __b, __c);
}
#endif
/* 8.5.4 Parallel 16-bit saturation */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif
/* 8.5.5 Packing and unpacking */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
typedef int32_t int8x4_t;
typedef int32_t int16x2_t;
typedef uint32_t uint8x4_t;
typedef uint32_t uint16x2_t;
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtab16(int16x2_t __a, int8x4_t __b) {
return __builtin_arm_sxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtb16(int8x4_t __a) {
return __builtin_arm_sxtb16(__a);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtab16(int16x2_t __a, int8x4_t __b) {
return __builtin_arm_uxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtb16(int8x4_t __a) {
return __builtin_arm_uxtb16(__a);
}
#endif
/* 8.5.6 Parallel selection */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_sel(__a, __b);
}
#endif
/* 8.5.7 Parallel 8-bit addition and subtraction */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_qadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qsub8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_qsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__sadd8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_sadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shadd8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_shadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shsub8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_shsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__ssub8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_ssub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uadd8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_uadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhadd8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_uhadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhsub8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_uhsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqadd8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_uqadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqsub8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_uqsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__usub8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_usub8(__a, __b);
}
#endif
/* 8.5.8 Sum of 8-bit absolute differences */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_usad8(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
return __builtin_arm_usada8(__a, __b, __c);
}
#endif
/* 8.5.9 Parallel 16-bit addition and subtraction */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_qadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qasx(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_qasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsax(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_qsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsub16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_qsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sadd16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_sadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sasx(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_sasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shadd16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_shadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shasx(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_shasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsax(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_shsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsub16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_shsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssax(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_ssax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssub16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_ssub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uadd16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uasx(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhadd16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uhadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhasx(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uhasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsax(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uhsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsub16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uhsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqadd16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uqadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqasx(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uqasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsax(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uqsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsub16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_uqsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usax(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_usax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usub16(uint16x2_t __a, uint16x2_t __b) {
return __builtin_arm_usub16(__a, __b);
}
#endif
/* 8.5.10 Parallel 16-bit multiplication */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
return __builtin_arm_smlad(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
return __builtin_arm_smladx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
return __builtin_arm_smlald(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
return __builtin_arm_smlaldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
return __builtin_arm_smlsd(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
return __builtin_arm_smlsdx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
return __builtin_arm_smlsld(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
return __builtin_arm_smlsldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuad(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_smuad(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuadx(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_smuadx(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusd(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_smusd(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusdx(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_smusdx(__a, __b);
}
#endif
/* 8.6 Floating-point data-processing intrinsics */
#if (defined(__ARM_FEATURE_DIRECTED_ROUNDING) && \
(__ARM_FEATURE_DIRECTED_ROUNDING)) && \
(defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__rintn(double __a) {
return __builtin_roundeven(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__rintnf(float __a) {
return __builtin_roundevenf(__a);
}
#endif
/* 8.8 CRC32 intrinsics */
#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) || \
(defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32b(uint32_t __a, uint8_t __b) {
return __builtin_arm_crc32b(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32h(uint32_t __a, uint16_t __b) {
return __builtin_arm_crc32h(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32w(uint32_t __a, uint32_t __b) {
return __builtin_arm_crc32w(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32d(uint32_t __a, uint64_t __b) {
return __builtin_arm_crc32d(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cb(uint32_t __a, uint8_t __b) {
return __builtin_arm_crc32cb(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32ch(uint32_t __a, uint16_t __b) {
return __builtin_arm_crc32ch(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cw(uint32_t __a, uint32_t __b) {
return __builtin_arm_crc32cw(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cd(uint32_t __a, uint64_t __b) {
return __builtin_arm_crc32cd(__a, __b);
}
#endif
/* 8.6 Floating-point data-processing intrinsics */
/* Armv8.3-A Javascript conversion intrinsic */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
__jcvt(double __a) {
return __builtin_arm_jcvt(__a);
}
#endif
/* Armv8.5-A FP rounding intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32zf(float __a) {
return __builtin_arm_rint32zf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32z(double __a) {
return __builtin_arm_rint32z(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64zf(float __a) {
return __builtin_arm_rint64zf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64z(double __a) {
return __builtin_arm_rint64z(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32xf(float __a) {
return __builtin_arm_rint32xf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32x(double __a) {
return __builtin_arm_rint32x(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64xf(float __a) {
return __builtin_arm_rint64xf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64x(double __a) {
return __builtin_arm_rint64x(__a);
}
#endif
/* 8.9 Armv8.7-A load/store 64-byte intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
typedef struct {
uint64_t val[8];
} data512_t;
static __inline__ data512_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_ld64b(const void *__addr) {
data512_t __value;
__builtin_arm_ld64b(__addr, __value.val);
return __value;
}
static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64b(void *__addr, data512_t __value) {
__builtin_arm_st64b(__addr, __value.val);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv(void *__addr, data512_t __value) {
return __builtin_arm_st64bv(__addr, __value.val);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv0(void *__addr, data512_t __value) {
return __builtin_arm_st64bv0(__addr, __value.val);
}
#endif
/* 11.1 Special register intrinsics */
#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
#define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
#define __arm_wsr128(sysreg, v) __builtin_arm_wsr128(sysreg, v)
#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))
/* 10.3 MTE intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask)
#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset)
#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded)
#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)
/* 18 memcpy family of operations intrinsics - MOPS */
#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
__builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
#endif
/* 11.3 Coprocessor Intrinsics */
#if defined(__ARM_FEATURE_COPROC)
#if (__ARM_FEATURE_COPROC & 0x1)
#if (__ARM_ARCH < 8)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
__builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#endif /* __ARM_ARCH < 8 */
#define __arm_ldc(coproc, CRd, p) __builtin_arm_ldc(coproc, CRd, p)
#define __arm_stc(coproc, CRd, p) __builtin_arm_stc(coproc, CRd, p)
#define __arm_mcr(coproc, opc1, value, CRn, CRm, opc2) \
__builtin_arm_mcr(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc(coproc, opc1, CRn, CRm, opc2) \
__builtin_arm_mrc(coproc, opc1, CRn, CRm, opc2)
#if (__ARM_ARCH != 4) && (__ARM_ARCH < 8)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* (__ARM_ARCH != 4) && (__ARM_ARCH != 8) */
#if (__ARM_ARCH_8M_MAIN__) || (__ARM_ARCH_8_1M_MAIN__)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
__builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* ___ARM_ARCH_8M_MAIN__ */
#endif /* __ARM_FEATURE_COPROC & 0x1 */
#if (__ARM_FEATURE_COPROC & 0x2)
#define __arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2) \
__builtin_arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldc2(coproc, CRd, p) __builtin_arm_ldc2(coproc, CRd, p)
#define __arm_stc2(coproc, CRd, p) __builtin_arm_stc2(coproc, CRd, p)
#define __arm_ldc2l(coproc, CRd, p) __builtin_arm_ldc2l(coproc, CRd, p)
#define __arm_stc2l(coproc, CRd, p) __builtin_arm_stc2l(coproc, CRd, p)
#define __arm_mcr2(coproc, opc1, value, CRn, CRm, opc2) \
__builtin_arm_mcr2(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc2(coproc, opc1, CRn, CRm, opc2) \
__builtin_arm_mrc2(coproc, opc1, CRn, CRm, opc2)
#endif
#if (__ARM_FEATURE_COPROC & 0x4)
#define __arm_mcrr(coproc, opc1, value, CRm) \
__builtin_arm_mcrr(coproc, opc1, value, CRm)
#define __arm_mrrc(coproc, opc1, CRm) __builtin_arm_mrrc(coproc, opc1, CRm)
#endif
#if (__ARM_FEATURE_COPROC & 0x8)
#define __arm_mcrr2(coproc, opc1, value, CRm) \
__builtin_arm_mcrr2(coproc, opc1, value, CRm)
#define __arm_mrrc2(coproc, opc1, CRm) __builtin_arm_mrrc2(coproc, opc1, CRm)
#endif
#endif // __ARM_FEATURE_COPROC
/* 17 Transactional Memory Extension (TME) Intrinsics */
#if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME
#define _TMFAILURE_REASON 0x00007fffu
#define _TMFAILURE_RTRY 0x00008000u
#define _TMFAILURE_CNCL 0x00010000u
#define _TMFAILURE_MEM 0x00020000u
#define _TMFAILURE_IMP 0x00040000u
#define _TMFAILURE_ERR 0x00080000u
#define _TMFAILURE_SIZE 0x00100000u
#define _TMFAILURE_NEST 0x00200000u
#define _TMFAILURE_DBG 0x00400000u
#define _TMFAILURE_INT 0x00800000u
#define _TMFAILURE_TRIVIAL 0x01000000u
#define __tstart() __builtin_arm_tstart()
#define __tcommit() __builtin_arm_tcommit()
#define __tcancel(__arg) __builtin_arm_tcancel(__arg)
#define __ttest() __builtin_arm_ttest()
#endif /* __ARM_FEATURE_TME */
/* 8.7 Armv8.5-A Random number generation intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndr(uint64_t *__p) {
return __builtin_arm_rndr(__p);
}
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndrrs(uint64_t *__p) {
return __builtin_arm_rndrrs(__p);
}
#endif
/* 11.2 Guarded Control Stack intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ void * __attribute__((__always_inline__, __nodebug__))
__gcspr() {
return (void *)__builtin_arm_rsr64("gcspr_el0");
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("gcs")))
__gcspopm() {
return __builtin_arm_gcspopm(0);
}
static __inline__ const void * __attribute__((__always_inline__, __nodebug__, target("gcs")))
__gcsss(const void *__stack) {
return __builtin_arm_gcsss(__stack);
}
#endif
#if defined(__cplusplus)
}
#endif
#endif /* __ARM_ACLE_H */

third_party/aarch64/clang/arm_bf16.h (new vendored file, 20 lines)

@ -0,0 +1,20 @@
/*===---- arm_bf16.h - ARM BF16 intrinsics -----------------------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __ARM_BF16_H
#define __ARM_BF16_H
typedef __bf16 bfloat16_t;
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
#undef __ai
#endif

third_party/aarch64/clang/arm_cde.h (new vendored file, 410 lines)

@ -0,0 +1,410 @@
/*===---- arm_cde.h - ARM CDE intrinsics -----------------------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __ARM_CDE_H
#define __ARM_CDE_H
#if !__ARM_FEATURE_CDE
#error "CDE support not enabled"
#endif
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1)))
uint32_t __arm_cx1(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1a)))
uint32_t __arm_cx1a(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1d)))
uint64_t __arm_cx1d(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1da)))
uint64_t __arm_cx1da(int, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2)))
uint32_t __arm_cx2(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2a)))
uint32_t __arm_cx2a(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2d)))
uint64_t __arm_cx2d(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2da)))
uint64_t __arm_cx2da(int, uint64_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3)))
uint32_t __arm_cx3(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3a)))
uint32_t __arm_cx3a(int, uint32_t, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3d)))
uint64_t __arm_cx3d(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3da)))
uint64_t __arm_cx3da(int, uint64_t, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1_u32)))
uint32_t __arm_vcx1_u32(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1a_u32)))
uint32_t __arm_vcx1a_u32(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1d_u64)))
uint64_t __arm_vcx1d_u64(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1da_u64)))
uint64_t __arm_vcx1da_u64(int, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2_u32)))
uint32_t __arm_vcx2_u32(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2a_u32)))
uint32_t __arm_vcx2a_u32(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2d_u64)))
uint64_t __arm_vcx2d_u64(int, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2da_u64)))
uint64_t __arm_vcx2da_u64(int, uint64_t, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3_u32)))
uint32_t __arm_vcx3_u32(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3a_u32)))
uint32_t __arm_vcx3a_u32(int, uint32_t, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3d_u64)))
uint64_t __arm_vcx3d_u64(int, uint64_t, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3da_u64)))
uint64_t __arm_vcx3da_u64(int, uint64_t, uint64_t, uint64_t, uint32_t);
#if __ARM_FEATURE_MVE
typedef uint16_t mve_pred16_t;
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) int16_t int16x8_t;
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) int32_t int32x4_t;
typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) int64_t int64x2_t;
typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) int8_t int8x16_t;
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) uint16_t uint16x8_t;
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) uint32_t uint32x4_t;
typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) uint64_t uint64x2_t;
typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) uint8_t uint8x16_t;
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s16)))
int16x8_t __arm_vcx1q_m(int, int16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s32)))
int32x4_t __arm_vcx1q_m(int, int32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s64)))
int64x2_t __arm_vcx1q_m(int, int64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s8)))
int8x16_t __arm_vcx1q_m(int, int8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u16)))
uint16x8_t __arm_vcx1q_m(int, uint16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u32)))
uint32x4_t __arm_vcx1q_m(int, uint32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u64)))
uint64x2_t __arm_vcx1q_m(int, uint64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u8)))
uint8x16_t __arm_vcx1q_m(int, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_u8)))
uint8x16_t __arm_vcx1q_u8(int, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s16)))
int16x8_t __arm_vcx1qa_m(int, int16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s32)))
int32x4_t __arm_vcx1qa_m(int, int32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s64)))
int64x2_t __arm_vcx1qa_m(int, int64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s8)))
int8x16_t __arm_vcx1qa_m(int, int8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u16)))
uint16x8_t __arm_vcx1qa_m(int, uint16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u32)))
uint32x4_t __arm_vcx1qa_m(int, uint32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u64)))
uint64x2_t __arm_vcx1qa_m(int, uint64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u8)))
uint8x16_t __arm_vcx1qa_m(int, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s16)))
int16x8_t __arm_vcx1qa(int, int16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s32)))
int32x4_t __arm_vcx1qa(int, int32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s64)))
int64x2_t __arm_vcx1qa(int, int64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s8)))
int8x16_t __arm_vcx1qa(int, int8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u16)))
uint16x8_t __arm_vcx1qa(int, uint16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u32)))
uint32x4_t __arm_vcx1qa(int, uint32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u64)))
uint64x2_t __arm_vcx1qa(int, uint64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u8)))
uint8x16_t __arm_vcx1qa(int, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s16)))
int16x8_t __arm_vcx2q_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s32)))
int32x4_t __arm_vcx2q_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s64)))
int64x2_t __arm_vcx2q_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s8)))
int8x16_t __arm_vcx2q_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u16)))
uint16x8_t __arm_vcx2q_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u32)))
uint32x4_t __arm_vcx2q_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u64)))
uint64x2_t __arm_vcx2q_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u8)))
uint8x16_t __arm_vcx2q_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s16)))
int16x8_t __arm_vcx2q(int, int16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s32)))
int32x4_t __arm_vcx2q(int, int32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s64)))
int64x2_t __arm_vcx2q(int, int64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s8)))
int8x16_t __arm_vcx2q(int, int8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u16)))
uint16x8_t __arm_vcx2q(int, uint16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u32)))
uint32x4_t __arm_vcx2q(int, uint32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u64)))
uint64x2_t __arm_vcx2q(int, uint64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8)))
uint8x16_t __arm_vcx2q(int, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s16)))
uint8x16_t __arm_vcx2q_u8(int, int16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s32)))
uint8x16_t __arm_vcx2q_u8(int, int32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s64)))
uint8x16_t __arm_vcx2q_u8(int, int64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s8)))
uint8x16_t __arm_vcx2q_u8(int, int8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u16)))
uint8x16_t __arm_vcx2q_u8(int, uint16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u32)))
uint8x16_t __arm_vcx2q_u8(int, uint32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u64)))
uint8x16_t __arm_vcx2q_u8(int, uint64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u8)))
uint8x16_t __arm_vcx2q_u8(int, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s16)))
int16x8_t __arm_vcx2qa_impl(int, int16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s32)))
int32x4_t __arm_vcx2qa_impl(int, int32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s64)))
int64x2_t __arm_vcx2qa_impl(int, int64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s8)))
int8x16_t __arm_vcx2qa_impl(int, int8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u16)))
uint16x8_t __arm_vcx2qa_impl(int, uint16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u32)))
uint32x4_t __arm_vcx2qa_impl(int, uint32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u64)))
uint64x2_t __arm_vcx2qa_impl(int, uint64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u8)))
uint8x16_t __arm_vcx2qa_impl(int, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s16)))
int16x8_t __arm_vcx2qa_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s32)))
int32x4_t __arm_vcx2qa_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s64)))
int64x2_t __arm_vcx2qa_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s8)))
int8x16_t __arm_vcx2qa_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u16)))
uint16x8_t __arm_vcx2qa_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u32)))
uint32x4_t __arm_vcx2qa_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u64)))
uint64x2_t __arm_vcx2qa_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u8)))
uint8x16_t __arm_vcx2qa_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s16)))
int16x8_t __arm_vcx3q_impl(int, int16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s32)))
int32x4_t __arm_vcx3q_impl(int, int32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s64)))
int64x2_t __arm_vcx3q_impl(int, int64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s8)))
int8x16_t __arm_vcx3q_impl(int, int8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u16)))
uint16x8_t __arm_vcx3q_impl(int, uint16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u32)))
uint32x4_t __arm_vcx3q_impl(int, uint32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u64)))
uint64x2_t __arm_vcx3q_impl(int, uint64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u8)))
uint8x16_t __arm_vcx3q_impl(int, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s16)))
int16x8_t __arm_vcx3q_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s32)))
int32x4_t __arm_vcx3q_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s64)))
int64x2_t __arm_vcx3q_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s8)))
int8x16_t __arm_vcx3q_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u16)))
uint16x8_t __arm_vcx3q_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u32)))
uint32x4_t __arm_vcx3q_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u64)))
uint64x2_t __arm_vcx3q_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u8)))
uint8x16_t __arm_vcx3q_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s16)))
uint8x16_t __arm_vcx3q_u8_impl(int, int16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s32)))
uint8x16_t __arm_vcx3q_u8_impl(int, int32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s64)))
uint8x16_t __arm_vcx3q_u8_impl(int, int64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s8)))
uint8x16_t __arm_vcx3q_u8_impl(int, int8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u16)))
uint8x16_t __arm_vcx3q_u8_impl(int, uint16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u32)))
uint8x16_t __arm_vcx3q_u8_impl(int, uint32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u64)))
uint8x16_t __arm_vcx3q_u8_impl(int, uint64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u8)))
uint8x16_t __arm_vcx3q_u8_impl(int, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s16)))
int16x8_t __arm_vcx3qa_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s32)))
int32x4_t __arm_vcx3qa_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s64)))
int64x2_t __arm_vcx3qa_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s8)))
int8x16_t __arm_vcx3qa_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u16)))
uint16x8_t __arm_vcx3qa_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u32)))
uint32x4_t __arm_vcx3qa_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u64)))
uint64x2_t __arm_vcx3qa_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u8)))
uint8x16_t __arm_vcx3qa_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s16)))
int16x8_t __arm_vcx3qa_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s32)))
int32x4_t __arm_vcx3qa_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s64)))
int64x2_t __arm_vcx3qa_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s8)))
int8x16_t __arm_vcx3qa_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u16)))
uint16x8_t __arm_vcx3qa_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u32)))
uint32x4_t __arm_vcx3qa_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u64)))
uint64x2_t __arm_vcx3qa_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u8)))
uint8x16_t __arm_vcx3qa_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u8)))
int16x8_t __arm_vreinterpretq_s16_u8(uint8x16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u8)))
int32x4_t __arm_vreinterpretq_s32_u8(uint8x16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u8)))
int64x2_t __arm_vreinterpretq_s64_u8(uint8x16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u8)))
int8x16_t __arm_vreinterpretq_s8_u8(uint8x16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u8)))
uint16x8_t __arm_vreinterpretq_u16_u8(uint8x16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u8)))
uint32x4_t __arm_vreinterpretq_u32_u8(uint8x16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u8)))
uint64x2_t __arm_vreinterpretq_u64_u8(uint8x16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s16)))
uint8x16_t __arm_vreinterpretq_u8(int16x8_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s32)))
uint8x16_t __arm_vreinterpretq_u8(int32x4_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s64)))
uint8x16_t __arm_vreinterpretq_u8(int64x2_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s8)))
uint8x16_t __arm_vreinterpretq_u8(int8x16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u16)))
uint8x16_t __arm_vreinterpretq_u8(uint16x8_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u32)))
uint8x16_t __arm_vreinterpretq_u8(uint32x4_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u64)))
uint8x16_t __arm_vreinterpretq_u8(uint64x2_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vreinterpretq_u8_u8)))
uint8x16_t __arm_vreinterpretq_u8(uint8x16_t);
#define __arm_vcx2q_m(cp, inactive, n, imm, pred) __arm_vcx2q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), (imm), (pred))
#define __arm_vcx2qa(cp, acc, n, imm) __arm_vcx2qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm))
#define __arm_vcx2qa_m(cp, acc, n, imm, pred) __arm_vcx2qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm), (pred))
#define __arm_vcx3q(cp, n, m, imm) __arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
#define __arm_vcx3q_m(cp, inactive, n, m, imm, pred) __arm_vcx3q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
#define __arm_vcx3q_u8(cp, n, m, imm) __arm_vcx3q_u8_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
#define __arm_vcx3qa(cp, acc, n, m, imm) __arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm))
#define __arm_vcx3qa_m(cp, acc, n, m, imm, pred) __arm_vcx3qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
#endif /* __ARM_FEATURE_MVE */
#if __ARM_FEATURE_MVE & 2
typedef __fp16 float16_t;
typedef float float32_t;
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) float16_t float16x8_t;
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) float32_t float32x4_t;
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f16)))
float16x8_t __arm_vcx1q_m(int, float16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f32)))
float32x4_t __arm_vcx1q_m(int, float32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f16)))
float16x8_t __arm_vcx1qa(int, float16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f32)))
float32x4_t __arm_vcx1qa(int, float32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f16)))
float16x8_t __arm_vcx1qa_m(int, float16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f32)))
float32x4_t __arm_vcx1qa_m(int, float32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f16)))
float16x8_t __arm_vcx2q(int, float16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f32)))
float32x4_t __arm_vcx2q(int, float32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f16)))
float16x8_t __arm_vcx2q_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f32)))
float32x4_t __arm_vcx2q_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f16)))
uint8x16_t __arm_vcx2q_u8(int, float16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f32)))
uint8x16_t __arm_vcx2q_u8(int, float32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f16)))
float16x8_t __arm_vcx2qa_impl(int, float16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f32)))
float32x4_t __arm_vcx2qa_impl(int, float32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f16)))
float16x8_t __arm_vcx2qa_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f32)))
float32x4_t __arm_vcx2qa_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f16)))
float16x8_t __arm_vcx3q_impl(int, float16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f32)))
float32x4_t __arm_vcx3q_impl(int, float32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f16)))
float16x8_t __arm_vcx3q_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f32)))
float32x4_t __arm_vcx3q_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f16)))
uint8x16_t __arm_vcx3q_u8_impl(int, float16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f32)))
uint8x16_t __arm_vcx3q_u8_impl(int, float32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f16)))
float16x8_t __arm_vcx3qa_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f32)))
float32x4_t __arm_vcx3qa_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f16)))
float16x8_t __arm_vcx3qa_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f32)))
float32x4_t __arm_vcx3qa_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u8)))
float16x8_t __arm_vreinterpretq_f16_u8(uint8x16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u8)))
float32x4_t __arm_vreinterpretq_f32_u8(uint8x16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f16)))
uint8x16_t __arm_vreinterpretq_u8(float16x8_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f32)))
uint8x16_t __arm_vreinterpretq_u8(float32x4_t);
#endif /* __ARM_FEATURE_MVE & 2 */
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* __ARM_CDE_H */
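The CDE intrinsics above map one-to-one onto the CX/VCX custom-instruction encodings, so their semantics come entirely from the target's coprocessor. A minimal sketch, assuming a hypothetical CDE coprocessor 0 (e.g. a Cortex-M target built with -march=armv8.1-m.main+cdecp0); the coprocessor number and immediates are placeholders:

#include <arm_cde.h>
/* VCX1 produces a value from an immediate alone; VCX1A also folds in
   an accumulator. What either instruction computes is defined by the
   custom coprocessor hardware, not by the compiler. */
uint32_t cde_demo(uint32_t acc) {
  uint32_t r = __arm_vcx1_u32(0, 1234);        /* coproc 0, imm 1234 */
  return __arm_vcx1a_u32(0, acc, 1234) ^ r;
}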

217
third_party/aarch64/clang/arm_cmse.h vendored Normal file
View file

@@ -0,0 +1,217 @@
//===---- arm_cmse.h - Arm CMSE support -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef __ARM_CMSE_H
#define __ARM_CMSE_H
#if (__ARM_FEATURE_CMSE & 0x1)
#include <stddef.h>
#include <stdint.h>
#define __ARM_CMSE_SECURE_MODE (__ARM_FEATURE_CMSE & 0x2)
#define CMSE_MPU_READWRITE 1 /* checks if readwrite_ok field is set */
#define CMSE_AU_NONSECURE 2 /* checks if permissions have secure field unset */
#define CMSE_MPU_UNPRIV 4 /* sets T flag on TT instruction */
#define CMSE_MPU_READ 8 /* checks if read_ok field is set */
#define CMSE_MPU_NONSECURE 16 /* sets A flag, checks if secure field unset */
#define CMSE_NONSECURE (CMSE_AU_NONSECURE | CMSE_MPU_NONSECURE)
#define cmse_check_pointed_object(p, f) \
cmse_check_address_range((p), sizeof(*(p)), (f))
#if defined(__cplusplus)
extern "C" {
#endif
typedef union {
struct cmse_address_info {
#ifdef __ARM_BIG_ENDIAN
/* __ARM_BIG_ENDIAN */
#if (__ARM_CMSE_SECURE_MODE)
unsigned idau_region : 8;
unsigned idau_region_valid : 1;
unsigned secure : 1;
unsigned nonsecure_readwrite_ok : 1;
unsigned nonsecure_read_ok : 1;
#else
unsigned : 12;
#endif
unsigned readwrite_ok : 1;
unsigned read_ok : 1;
#if (__ARM_CMSE_SECURE_MODE)
unsigned sau_region_valid : 1;
#else
unsigned : 1;
#endif
unsigned mpu_region_valid : 1;
#if (__ARM_CMSE_SECURE_MODE)
unsigned sau_region : 8;
#else
unsigned : 8;
#endif
unsigned mpu_region : 8;
#else /* __ARM_LITTLE_ENDIAN */
unsigned mpu_region : 8;
#if (__ARM_CMSE_SECURE_MODE)
unsigned sau_region : 8;
#else
unsigned : 8;
#endif
unsigned mpu_region_valid : 1;
#if (__ARM_CMSE_SECURE_MODE)
unsigned sau_region_valid : 1;
#else
unsigned : 1;
#endif
unsigned read_ok : 1;
unsigned readwrite_ok : 1;
#if (__ARM_CMSE_SECURE_MODE)
unsigned nonsecure_read_ok : 1;
unsigned nonsecure_readwrite_ok : 1;
unsigned secure : 1;
unsigned idau_region_valid : 1;
unsigned idau_region : 8;
#else
unsigned : 12;
#endif
#endif /*__ARM_LITTLE_ENDIAN */
} flags;
unsigned value;
} cmse_address_info_t;
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TT(void *__p) {
cmse_address_info_t __u;
__u.value = __builtin_arm_cmse_TT(__p);
return __u;
}
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TTT(void *__p) {
cmse_address_info_t __u;
__u.value = __builtin_arm_cmse_TTT(__p);
return __u;
}
#if __ARM_CMSE_SECURE_MODE
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TTA(void *__p) {
cmse_address_info_t __u;
__u.value = __builtin_arm_cmse_TTA(__p);
return __u;
}
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TTAT(void *__p) {
cmse_address_info_t __u;
__u.value = __builtin_arm_cmse_TTAT(__p);
return __u;
}
#endif
#define cmse_TT_fptr(p) cmse_TT(__builtin_bit_cast(void *, (p)))
#define cmse_TTT_fptr(p) cmse_TTT(__builtin_bit_cast(void *, (p)))
#if __ARM_CMSE_SECURE_MODE
#define cmse_TTA_fptr(p) cmse_TTA(__builtin_bit_cast(void *, (p)))
#define cmse_TTAT_fptr(p) cmse_TTAT(__builtin_bit_cast(void *, (p)))
#endif
static void *__attribute__((__always_inline__))
cmse_check_address_range(void *__pb, size_t __s, int __flags) {
uintptr_t __begin = (uintptr_t)__pb;
uintptr_t __end = __begin + __s - 1;
if (__end < __begin)
return NULL; /* wrap around check */
/* Check whether the range crosses a 32-byte aligned address */
const int __single_check = (__begin ^ __end) < 0x20u;
/* execute the right variant of the TT instructions */
void *__pe = (void *)__end;
cmse_address_info_t __permb, __perme;
switch (__flags & (CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
case 0:
__permb = cmse_TT(__pb);
__perme = __single_check ? __permb : cmse_TT(__pe);
break;
case CMSE_MPU_UNPRIV:
__permb = cmse_TTT(__pb);
__perme = __single_check ? __permb : cmse_TTT(__pe);
break;
#if __ARM_CMSE_SECURE_MODE
case CMSE_MPU_NONSECURE:
__permb = cmse_TTA(__pb);
__perme = __single_check ? __permb : cmse_TTA(__pe);
break;
case CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE:
__permb = cmse_TTAT(__pb);
__perme = __single_check ? __permb : cmse_TTAT(__pe);
break;
#endif
/* if CMSE_NONSECURE is specified w/o __ARM_CMSE_SECURE_MODE */
default:
return NULL;
}
/* check that the range does not cross MPU, SAU, or IDAU region boundaries */
if (__permb.value != __perme.value)
return NULL;
#if !(__ARM_CMSE_SECURE_MODE)
/* CMSE_AU_NONSECURE is only supported when __ARM_FEATURE_CMSE & 0x2 */
if (__flags & CMSE_AU_NONSECURE)
return NULL;
#endif
/* check the permission on the range */
switch (__flags & ~(CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
#if (__ARM_CMSE_SECURE_MODE)
case CMSE_MPU_READ | CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
case CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
return __permb.flags.nonsecure_readwrite_ok ? __pb : NULL;
case CMSE_MPU_READ | CMSE_AU_NONSECURE:
return __permb.flags.nonsecure_read_ok ? __pb : NULL;
case CMSE_AU_NONSECURE:
return __permb.flags.secure ? NULL : __pb;
#endif
case CMSE_MPU_READ | CMSE_MPU_READWRITE:
case CMSE_MPU_READWRITE:
return __permb.flags.readwrite_ok ? __pb : NULL;
case CMSE_MPU_READ:
return __permb.flags.read_ok ? __pb : NULL;
default:
return NULL;
}
}
#if __ARM_CMSE_SECURE_MODE
static int __attribute__((__always_inline__, __nodebug__))
cmse_nonsecure_caller(void) {
return !((uintptr_t)__builtin_return_address(0) & 1);
}
#define cmse_nsfptr_create(p) \
__builtin_bit_cast(__typeof__(p), \
(__builtin_bit_cast(uintptr_t, p) & ~(uintptr_t)1))
#define cmse_is_nsfptr(p) ((__builtin_bit_cast(uintptr_t, p) & 1) == 0)
#endif /* __ARM_CMSE_SECURE_MODE */
void __attribute__((__noreturn__)) cmse_abort(void);
#if defined(__cplusplus)
}
#endif
#endif /* (__ARM_FEATURE_CMSE & 0x1) */
#endif /* __ARM_CMSE_H */
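cmse_check_address_range is the workhorse here: it TT-probes both ends of a buffer, rejects ranges that straddle an MPU, SAU, or IDAU region boundary, then tests the requested permission bits. A minimal sketch of how a secure gateway might validate a non-secure caller's buffer, assuming secure-mode compilation (-mcmse); secure_read_ok is a hypothetical helper:

#include <arm_cmse.h>
/* Return nonzero only if the non-secure caller may read all n bytes. */
int secure_read_ok(const void *src, size_t n) {
  void *p = cmse_check_address_range((void *)src, n,
                                     CMSE_MPU_READ | CMSE_NONSECURE);
  return p != NULL;  /* NULL: wraps, crosses a region, or lacks access */
}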

596
third_party/aarch64/clang/arm_fp16.h vendored Normal file
View file

@@ -0,0 +1,596 @@
/*===---- arm_fp16.h - ARM FP16 intrinsics ---------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __ARM_FP16_H
#define __ARM_FP16_H
#include <stdint.h>
typedef __fp16 float16_t;
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
#if defined(__aarch64__) || defined(__arm64ec__)
#define vabdh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vabdh_f16(__s0, __s1); \
__ret; \
})
#define vabsh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vabsh_f16(__s0); \
__ret; \
})
#define vaddh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vaddh_f16(__s0, __s1); \
__ret; \
})
#define vcageh_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vcageh_f16(__s0, __s1); \
__ret; \
})
#define vcagth_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vcagth_f16(__s0, __s1); \
__ret; \
})
#define vcaleh_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vcaleh_f16(__s0, __s1); \
__ret; \
})
#define vcalth_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vcalth_f16(__s0, __s1); \
__ret; \
})
#define vceqh_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vceqh_f16(__s0, __s1); \
__ret; \
})
#define vceqzh_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vceqzh_f16(__s0); \
__ret; \
})
#define vcgeh_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vcgeh_f16(__s0, __s1); \
__ret; \
})
#define vcgezh_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcgezh_f16(__s0); \
__ret; \
})
#define vcgth_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vcgth_f16(__s0, __s1); \
__ret; \
})
#define vcgtzh_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcgtzh_f16(__s0); \
__ret; \
})
#define vcleh_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vcleh_f16(__s0, __s1); \
__ret; \
})
#define vclezh_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vclezh_f16(__s0); \
__ret; \
})
#define vclth_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (uint16_t) __builtin_neon_vclth_f16(__s0, __s1); \
__ret; \
})
#define vcltzh_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcltzh_f16(__s0); \
__ret; \
})
#define vcvth_n_s16_f16(__p0, __p1) __extension__ ({ \
int16_t __ret; \
float16_t __s0 = __p0; \
__ret = (int16_t) __builtin_neon_vcvth_n_s16_f16(__s0, __p1); \
__ret; \
})
#define vcvth_n_s32_f16(__p0, __p1) __extension__ ({ \
int32_t __ret; \
float16_t __s0 = __p0; \
__ret = (int32_t) __builtin_neon_vcvth_n_s32_f16(__s0, __p1); \
__ret; \
})
#define vcvth_n_s64_f16(__p0, __p1) __extension__ ({ \
int64_t __ret; \
float16_t __s0 = __p0; \
__ret = (int64_t) __builtin_neon_vcvth_n_s64_f16(__s0, __p1); \
__ret; \
})
#define vcvth_n_u16_f16(__p0, __p1) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcvth_n_u16_f16(__s0, __p1); \
__ret; \
})
#define vcvth_n_u32_f16(__p0, __p1) __extension__ ({ \
uint32_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint32_t) __builtin_neon_vcvth_n_u32_f16(__s0, __p1); \
__ret; \
})
#define vcvth_n_u64_f16(__p0, __p1) __extension__ ({ \
uint64_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint64_t) __builtin_neon_vcvth_n_u64_f16(__s0, __p1); \
__ret; \
})
#define vcvth_s16_f16(__p0) __extension__ ({ \
int16_t __ret; \
float16_t __s0 = __p0; \
__ret = (int16_t) __builtin_neon_vcvth_s16_f16(__s0); \
__ret; \
})
#define vcvth_s32_f16(__p0) __extension__ ({ \
int32_t __ret; \
float16_t __s0 = __p0; \
__ret = (int32_t) __builtin_neon_vcvth_s32_f16(__s0); \
__ret; \
})
#define vcvth_s64_f16(__p0) __extension__ ({ \
int64_t __ret; \
float16_t __s0 = __p0; \
__ret = (int64_t) __builtin_neon_vcvth_s64_f16(__s0); \
__ret; \
})
#define vcvth_u16_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcvth_u16_f16(__s0); \
__ret; \
})
#define vcvth_u32_f16(__p0) __extension__ ({ \
uint32_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint32_t) __builtin_neon_vcvth_u32_f16(__s0); \
__ret; \
})
#define vcvth_u64_f16(__p0) __extension__ ({ \
uint64_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint64_t) __builtin_neon_vcvth_u64_f16(__s0); \
__ret; \
})
#define vcvtah_s16_f16(__p0) __extension__ ({ \
int16_t __ret; \
float16_t __s0 = __p0; \
__ret = (int16_t) __builtin_neon_vcvtah_s16_f16(__s0); \
__ret; \
})
#define vcvtah_s32_f16(__p0) __extension__ ({ \
int32_t __ret; \
float16_t __s0 = __p0; \
__ret = (int32_t) __builtin_neon_vcvtah_s32_f16(__s0); \
__ret; \
})
#define vcvtah_s64_f16(__p0) __extension__ ({ \
int64_t __ret; \
float16_t __s0 = __p0; \
__ret = (int64_t) __builtin_neon_vcvtah_s64_f16(__s0); \
__ret; \
})
#define vcvtah_u16_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcvtah_u16_f16(__s0); \
__ret; \
})
#define vcvtah_u32_f16(__p0) __extension__ ({ \
uint32_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint32_t) __builtin_neon_vcvtah_u32_f16(__s0); \
__ret; \
})
#define vcvtah_u64_f16(__p0) __extension__ ({ \
uint64_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint64_t) __builtin_neon_vcvtah_u64_f16(__s0); \
__ret; \
})
#define vcvth_f16_u16(__p0) __extension__ ({ \
float16_t __ret; \
uint16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_f16_u16(__s0); \
__ret; \
})
#define vcvth_f16_s16(__p0) __extension__ ({ \
float16_t __ret; \
int16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_f16_s16(__s0); \
__ret; \
})
#define vcvth_f16_u32(__p0) __extension__ ({ \
float16_t __ret; \
uint32_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_f16_u32(__s0); \
__ret; \
})
#define vcvth_f16_s32(__p0) __extension__ ({ \
float16_t __ret; \
int32_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_f16_s32(__s0); \
__ret; \
})
#define vcvth_f16_u64(__p0) __extension__ ({ \
float16_t __ret; \
uint64_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_f16_u64(__s0); \
__ret; \
})
#define vcvth_f16_s64(__p0) __extension__ ({ \
float16_t __ret; \
int64_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_f16_s64(__s0); \
__ret; \
})
#define vcvth_n_f16_u32(__p0, __p1) __extension__ ({ \
float16_t __ret; \
uint32_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_n_f16_u32(__s0, __p1); \
__ret; \
})
#define vcvth_n_f16_s32(__p0, __p1) __extension__ ({ \
float16_t __ret; \
int32_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_n_f16_s32(__s0, __p1); \
__ret; \
})
#define vcvth_n_f16_u64(__p0, __p1) __extension__ ({ \
float16_t __ret; \
uint64_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_n_f16_u64(__s0, __p1); \
__ret; \
})
#define vcvth_n_f16_s64(__p0, __p1) __extension__ ({ \
float16_t __ret; \
int64_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_n_f16_s64(__s0, __p1); \
__ret; \
})
#define vcvth_n_f16_u16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
uint16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_n_f16_u16(__s0, __p1); \
__ret; \
})
#define vcvth_n_f16_s16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
int16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vcvth_n_f16_s16(__s0, __p1); \
__ret; \
})
#define vcvtmh_s16_f16(__p0) __extension__ ({ \
int16_t __ret; \
float16_t __s0 = __p0; \
__ret = (int16_t) __builtin_neon_vcvtmh_s16_f16(__s0); \
__ret; \
})
#define vcvtmh_s32_f16(__p0) __extension__ ({ \
int32_t __ret; \
float16_t __s0 = __p0; \
__ret = (int32_t) __builtin_neon_vcvtmh_s32_f16(__s0); \
__ret; \
})
#define vcvtmh_s64_f16(__p0) __extension__ ({ \
int64_t __ret; \
float16_t __s0 = __p0; \
__ret = (int64_t) __builtin_neon_vcvtmh_s64_f16(__s0); \
__ret; \
})
#define vcvtmh_u16_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcvtmh_u16_f16(__s0); \
__ret; \
})
#define vcvtmh_u32_f16(__p0) __extension__ ({ \
uint32_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint32_t) __builtin_neon_vcvtmh_u32_f16(__s0); \
__ret; \
})
#define vcvtmh_u64_f16(__p0) __extension__ ({ \
uint64_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint64_t) __builtin_neon_vcvtmh_u64_f16(__s0); \
__ret; \
})
#define vcvtnh_s16_f16(__p0) __extension__ ({ \
int16_t __ret; \
float16_t __s0 = __p0; \
__ret = (int16_t) __builtin_neon_vcvtnh_s16_f16(__s0); \
__ret; \
})
#define vcvtnh_s32_f16(__p0) __extension__ ({ \
int32_t __ret; \
float16_t __s0 = __p0; \
__ret = (int32_t) __builtin_neon_vcvtnh_s32_f16(__s0); \
__ret; \
})
#define vcvtnh_s64_f16(__p0) __extension__ ({ \
int64_t __ret; \
float16_t __s0 = __p0; \
__ret = (int64_t) __builtin_neon_vcvtnh_s64_f16(__s0); \
__ret; \
})
#define vcvtnh_u16_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcvtnh_u16_f16(__s0); \
__ret; \
})
#define vcvtnh_u32_f16(__p0) __extension__ ({ \
uint32_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint32_t) __builtin_neon_vcvtnh_u32_f16(__s0); \
__ret; \
})
#define vcvtnh_u64_f16(__p0) __extension__ ({ \
uint64_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint64_t) __builtin_neon_vcvtnh_u64_f16(__s0); \
__ret; \
})
#define vcvtph_s16_f16(__p0) __extension__ ({ \
int16_t __ret; \
float16_t __s0 = __p0; \
__ret = (int16_t) __builtin_neon_vcvtph_s16_f16(__s0); \
__ret; \
})
#define vcvtph_s32_f16(__p0) __extension__ ({ \
int32_t __ret; \
float16_t __s0 = __p0; \
__ret = (int32_t) __builtin_neon_vcvtph_s32_f16(__s0); \
__ret; \
})
#define vcvtph_s64_f16(__p0) __extension__ ({ \
int64_t __ret; \
float16_t __s0 = __p0; \
__ret = (int64_t) __builtin_neon_vcvtph_s64_f16(__s0); \
__ret; \
})
#define vcvtph_u16_f16(__p0) __extension__ ({ \
uint16_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint16_t) __builtin_neon_vcvtph_u16_f16(__s0); \
__ret; \
})
#define vcvtph_u32_f16(__p0) __extension__ ({ \
uint32_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint32_t) __builtin_neon_vcvtph_u32_f16(__s0); \
__ret; \
})
#define vcvtph_u64_f16(__p0) __extension__ ({ \
uint64_t __ret; \
float16_t __s0 = __p0; \
__ret = (uint64_t) __builtin_neon_vcvtph_u64_f16(__s0); \
__ret; \
})
#define vdivh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vdivh_f16(__s0, __s1); \
__ret; \
})
#define vfmah_f16(__p0, __p1, __p2) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
float16_t __s2 = __p2; \
__ret = (float16_t) __builtin_neon_vfmah_f16(__s0, __s1, __s2); \
__ret; \
})
#define vfmsh_f16(__p0, __p1, __p2) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
float16_t __s2 = __p2; \
__ret = (float16_t) __builtin_neon_vfmsh_f16(__s0, __s1, __s2); \
__ret; \
})
#define vmaxh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vmaxh_f16(__s0, __s1); \
__ret; \
})
#define vmaxnmh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vmaxnmh_f16(__s0, __s1); \
__ret; \
})
#define vminh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vminh_f16(__s0, __s1); \
__ret; \
})
#define vminnmh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vminnmh_f16(__s0, __s1); \
__ret; \
})
#define vmulh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vmulh_f16(__s0, __s1); \
__ret; \
})
#define vmulxh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vmulxh_f16(__s0, __s1); \
__ret; \
})
#define vnegh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vnegh_f16(__s0); \
__ret; \
})
#define vrecpeh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrecpeh_f16(__s0); \
__ret; \
})
#define vrecpsh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vrecpsh_f16(__s0, __s1); \
__ret; \
})
#define vrecpxh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrecpxh_f16(__s0); \
__ret; \
})
#define vrndh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrndh_f16(__s0); \
__ret; \
})
#define vrndah_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrndah_f16(__s0); \
__ret; \
})
#define vrndih_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrndih_f16(__s0); \
__ret; \
})
#define vrndmh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrndmh_f16(__s0); \
__ret; \
})
#define vrndnh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrndnh_f16(__s0); \
__ret; \
})
#define vrndph_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrndph_f16(__s0); \
__ret; \
})
#define vrndxh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrndxh_f16(__s0); \
__ret; \
})
#define vrsqrteh_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vrsqrteh_f16(__s0); \
__ret; \
})
#define vrsqrtsh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vrsqrtsh_f16(__s0, __s1); \
__ret; \
})
#define vsqrth_f16(__p0) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
__ret = (float16_t) __builtin_neon_vsqrth_f16(__s0); \
__ret; \
})
#define vsubh_f16(__p0, __p1) __extension__ ({ \
float16_t __ret; \
float16_t __s0 = __p0; \
float16_t __s1 = __p1; \
__ret = (float16_t) __builtin_neon_vsubh_f16(__s0, __s1); \
__ret; \
})
#endif
#undef __ai
#endif /* __ARM_FP16_H */
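Each macro above evaluates its operands once into locals before invoking the builtin, which keeps the intrinsics safe to use inside larger expressions. A small sketch using only intrinsics declared above, assuming an FP16-capable target such as -march=armv8.2-a+fp16; hypot_f16 is a hypothetical helper that takes no care about intermediate overflow:

#include <arm_fp16.h>
/* a*a + b*b, then a half-precision square root. */
float16_t hypot_f16(float16_t a, float16_t b) {
  float16_t s = vaddh_f16(vmulh_f16(a, a), vmulh_f16(b, b));
  return vsqrth_f16(s);
}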

19187
third_party/aarch64/clang/arm_mve.h vendored Normal file

File diff suppressed because it is too large

69638
third_party/aarch64/clang/arm_neon.h vendored Normal file

File diff suppressed because it is too large

182
third_party/aarch64/clang/arm_neon_sve_bridge.h vendored Normal file
View file

@@ -0,0 +1,182 @@
/*===---- arm_neon_sve_bridge.h - ARM NEON SVE Bridge intrinsics -----------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __ARM_NEON_SVE_BRIDGE_H
#define __ARM_NEON_SVE_BRIDGE_H
#include <arm_neon.h>
#include <arm_sve.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Function attributes */
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
#define __aio \
static __inline__ \
__attribute__((__always_inline__, __nodebug__, __overloadable__))
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
svint8_t svset_neonq(svint8_t, int8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
svint16_t svset_neonq(svint16_t, int16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
svint32_t svset_neonq(svint32_t, int32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
svint64_t svset_neonq(svint64_t, int64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
svuint8_t svset_neonq(svuint8_t, uint8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
svuint16_t svset_neonq(svuint16_t, uint16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
svuint32_t svset_neonq(svuint32_t, uint32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
svuint64_t svset_neonq(svuint64_t, uint64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
svfloat16_t svset_neonq(svfloat16_t, float16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
svfloat32_t svset_neonq(svfloat32_t, float32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
svfloat64_t svset_neonq(svfloat64_t, float64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
svint8_t svset_neonq_s8(svint8_t, int8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
svint16_t svset_neonq_s16(svint16_t, int16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
svint32_t svset_neonq_s32(svint32_t, int32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
svint64_t svset_neonq_s64(svint64_t, int64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
svuint8_t svset_neonq_u8(svuint8_t, uint8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
svuint16_t svset_neonq_u16(svuint16_t, uint16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
svuint32_t svset_neonq_u32(svuint32_t, uint32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
svuint64_t svset_neonq_u64(svuint64_t, uint64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
svfloat16_t svset_neonq_f16(svfloat16_t, float16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
svfloat32_t svset_neonq_f32(svfloat32_t, float32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
svfloat64_t svset_neonq_f64(svfloat64_t, float64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
int8x16_t svget_neonq(svint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
int16x8_t svget_neonq(svint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
int32x4_t svget_neonq(svint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
int64x2_t svget_neonq(svint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
uint8x16_t svget_neonq(svuint8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
uint16x8_t svget_neonq(svuint16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
uint32x4_t svget_neonq(svuint32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
uint64x2_t svget_neonq(svuint64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
float16x8_t svget_neonq(svfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
float32x4_t svget_neonq(svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
float64x2_t svget_neonq(svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
int8x16_t svget_neonq_s8(svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
int16x8_t svget_neonq_s16(svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
int32x4_t svget_neonq_s32(svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
int64x2_t svget_neonq_s64(svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
uint8x16_t svget_neonq_u8(svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
uint16x8_t svget_neonq_u16(svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
uint32x4_t svget_neonq_u32(svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
uint64x2_t svget_neonq_u64(svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
float16x8_t svget_neonq_f16(svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
float32x4_t svget_neonq_f32(svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
float64x2_t svget_neonq_f64(svfloat64_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
svint8_t svdup_neonq(int8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
svint16_t svdup_neonq(int16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
svint32_t svdup_neonq(int32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
svint64_t svdup_neonq(int64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
svuint8_t svdup_neonq(uint8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
svuint16_t svdup_neonq(uint16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
svuint32_t svdup_neonq(uint32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
svuint64_t svdup_neonq(uint64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
svfloat16_t svdup_neonq(float16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
svfloat32_t svdup_neonq(float32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
svfloat64_t svdup_neonq(float64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
svint8_t svdup_neonq_s8(int8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
svint16_t svdup_neonq_s16(int16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
svint32_t svdup_neonq_s32(int32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
svint64_t svdup_neonq_s64(int64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
svuint8_t svdup_neonq_u8(uint8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
svuint16_t svdup_neonq_u16(uint16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
svuint32_t svdup_neonq_u32(uint32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
svuint64_t svdup_neonq_u64(uint64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
svfloat16_t svdup_neonq_f16(float16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
svfloat32_t svdup_neonq_f32(float32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
svfloat64_t svdup_neonq_f64(float64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
svbfloat16_t svset_neonq(svbfloat16_t, bfloat16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
svbfloat16_t svset_neonq_bf16(svbfloat16_t, bfloat16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
bfloat16x8_t svget_neonq(svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
bfloat16x8_t svget_neonq_bf16(svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
svbfloat16_t svdup_neonq(bfloat16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
svbfloat16_t svdup_neonq_bf16(bfloat16x8_t);
#undef __ai
#undef __aio
#ifdef __cplusplus
} // extern "C"
#endif
#endif //__ARM_NEON_SVE_BRIDGE_H
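A usage sketch (not from the vendored header; assumes SVE hardware and a flag like -march=armv8-a+sve): the bridge intrinsics move a NEON value into the low 128 bits of a scalable vector and back, so a fixed-width value can pass through an SVE operation.

#include <arm_neon.h>
#include <arm_sve.h>
#include <arm_neon_sve_bridge.h>

/* Add two NEON int32x4_t values through SVE; the NEON operands occupy
   the low 128 bits of the SVE registers and the upper lanes stay
   undefined, which is fine because only the low 128 bits are read back. */
int32x4_t add_via_sve(int32x4_t a, int32x4_t b) {
  svint32_t za = svset_neonq_s32(svundef_s32(), a);
  svint32_t zb = svset_neonq_s32(svundef_s32(), b);
  svint32_t zc = svadd_s32_x(svptrue_b32(), za, zb);
  return svget_neonq_s32(zc);
}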

2819
third_party/aarch64/clang/arm_sme.h vendored Normal file

File diff suppressed because it is too large

30537
third_party/aarch64/clang/arm_sve.h vendored Normal file

File diff suppressed because it is too large

View file

@ -0,0 +1,345 @@
/*===---- arm_vector_types - ARM vector type ------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#if !defined(__ARM_NEON_H) && !defined(__ARM_SVE_H)
#error "This file should not be used standalone. Please include arm_neon.h or arm_sve.h instead"
#endif
#ifndef __ARM_NEON_TYPES_H
#define __ARM_NEON_TYPES_H
typedef float float32_t;
typedef __fp16 float16_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef double float64_t;
#endif
typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t;
typedef __attribute__((neon_vector_type(8))) int16_t int16x8_t;
typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
typedef __attribute__((neon_vector_type(1))) int64_t int64x1_t;
typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
typedef __attribute__((neon_vector_type(4))) uint16_t uint16x4_t;
typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef __attribute__((neon_vector_type(1))) float64_t float64x1_t;
typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
#endif
typedef struct int8x8x2_t {
int8x8_t val[2];
} int8x8x2_t;
typedef struct int8x16x2_t {
int8x16_t val[2];
} int8x16x2_t;
typedef struct int16x4x2_t {
int16x4_t val[2];
} int16x4x2_t;
typedef struct int16x8x2_t {
int16x8_t val[2];
} int16x8x2_t;
typedef struct int32x2x2_t {
int32x2_t val[2];
} int32x2x2_t;
typedef struct int32x4x2_t {
int32x4_t val[2];
} int32x4x2_t;
typedef struct int64x1x2_t {
int64x1_t val[2];
} int64x1x2_t;
typedef struct int64x2x2_t {
int64x2_t val[2];
} int64x2x2_t;
typedef struct uint8x8x2_t {
uint8x8_t val[2];
} uint8x8x2_t;
typedef struct uint8x16x2_t {
uint8x16_t val[2];
} uint8x16x2_t;
typedef struct uint16x4x2_t {
uint16x4_t val[2];
} uint16x4x2_t;
typedef struct uint16x8x2_t {
uint16x8_t val[2];
} uint16x8x2_t;
typedef struct uint32x2x2_t {
uint32x2_t val[2];
} uint32x2x2_t;
typedef struct uint32x4x2_t {
uint32x4_t val[2];
} uint32x4x2_t;
typedef struct uint64x1x2_t {
uint64x1_t val[2];
} uint64x1x2_t;
typedef struct uint64x2x2_t {
uint64x2_t val[2];
} uint64x2x2_t;
typedef struct float16x4x2_t {
float16x4_t val[2];
} float16x4x2_t;
typedef struct float16x8x2_t {
float16x8_t val[2];
} float16x8x2_t;
typedef struct float32x2x2_t {
float32x2_t val[2];
} float32x2x2_t;
typedef struct float32x4x2_t {
float32x4_t val[2];
} float32x4x2_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct float64x1x2_t {
float64x1_t val[2];
} float64x1x2_t;
typedef struct float64x2x2_t {
float64x2_t val[2];
} float64x2x2_t;
#endif
typedef struct int8x8x3_t {
int8x8_t val[3];
} int8x8x3_t;
typedef struct int8x16x3_t {
int8x16_t val[3];
} int8x16x3_t;
typedef struct int16x4x3_t {
int16x4_t val[3];
} int16x4x3_t;
typedef struct int16x8x3_t {
int16x8_t val[3];
} int16x8x3_t;
typedef struct int32x2x3_t {
int32x2_t val[3];
} int32x2x3_t;
typedef struct int32x4x3_t {
int32x4_t val[3];
} int32x4x3_t;
typedef struct int64x1x3_t {
int64x1_t val[3];
} int64x1x3_t;
typedef struct int64x2x3_t {
int64x2_t val[3];
} int64x2x3_t;
typedef struct uint8x8x3_t {
uint8x8_t val[3];
} uint8x8x3_t;
typedef struct uint8x16x3_t {
uint8x16_t val[3];
} uint8x16x3_t;
typedef struct uint16x4x3_t {
uint16x4_t val[3];
} uint16x4x3_t;
typedef struct uint16x8x3_t {
uint16x8_t val[3];
} uint16x8x3_t;
typedef struct uint32x2x3_t {
uint32x2_t val[3];
} uint32x2x3_t;
typedef struct uint32x4x3_t {
uint32x4_t val[3];
} uint32x4x3_t;
typedef struct uint64x1x3_t {
uint64x1_t val[3];
} uint64x1x3_t;
typedef struct uint64x2x3_t {
uint64x2_t val[3];
} uint64x2x3_t;
typedef struct float16x4x3_t {
float16x4_t val[3];
} float16x4x3_t;
typedef struct float16x8x3_t {
float16x8_t val[3];
} float16x8x3_t;
typedef struct float32x2x3_t {
float32x2_t val[3];
} float32x2x3_t;
typedef struct float32x4x3_t {
float32x4_t val[3];
} float32x4x3_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct float64x1x3_t {
float64x1_t val[3];
} float64x1x3_t;
typedef struct float64x2x3_t {
float64x2_t val[3];
} float64x2x3_t;
#endif
typedef struct int8x8x4_t {
int8x8_t val[4];
} int8x8x4_t;
typedef struct int8x16x4_t {
int8x16_t val[4];
} int8x16x4_t;
typedef struct int16x4x4_t {
int16x4_t val[4];
} int16x4x4_t;
typedef struct int16x8x4_t {
int16x8_t val[4];
} int16x8x4_t;
typedef struct int32x2x4_t {
int32x2_t val[4];
} int32x2x4_t;
typedef struct int32x4x4_t {
int32x4_t val[4];
} int32x4x4_t;
typedef struct int64x1x4_t {
int64x1_t val[4];
} int64x1x4_t;
typedef struct int64x2x4_t {
int64x2_t val[4];
} int64x2x4_t;
typedef struct uint8x8x4_t {
uint8x8_t val[4];
} uint8x8x4_t;
typedef struct uint8x16x4_t {
uint8x16_t val[4];
} uint8x16x4_t;
typedef struct uint16x4x4_t {
uint16x4_t val[4];
} uint16x4x4_t;
typedef struct uint16x8x4_t {
uint16x8_t val[4];
} uint16x8x4_t;
typedef struct uint32x2x4_t {
uint32x2_t val[4];
} uint32x2x4_t;
typedef struct uint32x4x4_t {
uint32x4_t val[4];
} uint32x4x4_t;
typedef struct uint64x1x4_t {
uint64x1_t val[4];
} uint64x1x4_t;
typedef struct uint64x2x4_t {
uint64x2_t val[4];
} uint64x2x4_t;
typedef struct float16x4x4_t {
float16x4_t val[4];
} float16x4x4_t;
typedef struct float16x8x4_t {
float16x8_t val[4];
} float16x8x4_t;
typedef struct float32x2x4_t {
float32x2_t val[4];
} float32x2x4_t;
typedef struct float32x4x4_t {
float32x4_t val[4];
} float32x4x4_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct float64x1x4_t {
float64x1_t val[4];
} float64x1x4_t;
typedef struct float64x2x4_t {
float64x2_t val[4];
} float64x2x4_t;
#endif
typedef __attribute__((neon_vector_type(4))) bfloat16_t bfloat16x4_t;
typedef __attribute__((neon_vector_type(8))) bfloat16_t bfloat16x8_t;
typedef struct bfloat16x4x2_t {
bfloat16x4_t val[2];
} bfloat16x4x2_t;
typedef struct bfloat16x8x2_t {
bfloat16x8_t val[2];
} bfloat16x8x2_t;
typedef struct bfloat16x4x3_t {
bfloat16x4_t val[3];
} bfloat16x4x3_t;
typedef struct bfloat16x8x3_t {
bfloat16x8_t val[3];
} bfloat16x8x3_t;
typedef struct bfloat16x4x4_t {
bfloat16x4_t val[4];
} bfloat16x4x4_t;
typedef struct bfloat16x8x4_t {
bfloat16x8_t val[4];
} bfloat16x8x4_t;
#endif // __ARM_NEON_TYPES_H
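A minimal sketch of where the tuple types above show up in practice (not part of this diff): the NxMx2/x3/x4 structs are the return types of NEON's de-interleaving loads.

#include <arm_neon.h>

/* vld2q_f32 de-interleaves 8 floats into a float32x4x2_t tuple:
   val[0] receives elements 0,2,4,6 and val[1] receives 1,3,5,7. */
float32x4_t sum_pairs(const float *p) {
  float32x4x2_t v = vld2q_f32(p);
  return vaddq_f32(v.val[0], v.val[1]);
}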

31
third_party/aarch64/clang/armintr.h vendored Normal file
View file

@ -0,0 +1,31 @@
/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
/* Only include this if we're compiling for the Windows platform. */
#ifndef _MSC_VER
#include_next <armintr.h>
#else
#ifndef __ARMINTR_H
#define __ARMINTR_H
typedef enum
{
_ARM_BARRIER_SY = 0xF,
_ARM_BARRIER_ST = 0xE,
_ARM_BARRIER_ISH = 0xB,
_ARM_BARRIER_ISHST = 0xA,
_ARM_BARRIER_NSH = 0x7,
_ARM_BARRIER_NSHST = 0x6,
_ARM_BARRIER_OSH = 0x3,
_ARM_BARRIER_OSHST = 0x2
} _ARMINTR_BARRIER_TYPE;
#endif /* __ARMINTR_H */
#endif /* _MSC_VER */
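A sketch of how these constants are consumed (an assumption, not shown in this diff): on 32-bit ARM MSVC targets, barrier intrinsics such as __dmb take one of the _ARM_BARRIER_* values.

#include <intrin.h>

/* MSVC ARM sketch: issue an inner-shareable data memory barrier
   before publishing a flag to other observers. */
void publish(volatile int *flag) {
  __dmb(_ARM_BARRIER_ISH);
  *flag = 1;
}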

View file

@ -495,7 +495,7 @@ makearraystring(Node *p, const char *func)
if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
FATAL("%s: out of memory %s[%s...]",
func, x->nval, buf);
func ? func : "NULL", x->nval, buf);
}
memcpy(buf + blen, s, slen);
if (nsub) {

View file

@ -34,7 +34,8 @@ THIRD_PARTY_DOUBLECONVERSION_A_DIRECTDEPS = \
LIBC_MEM \
LIBC_STR \
LIBC_TINYMATH \
THIRD_PARTY_LIBCXXABI
THIRD_PARTY_LIBCXXABI \
THIRD_PARTY_LIBUNWIND
THIRD_PARTY_DOUBLECONVERSION_A_DEPS := \
$(call uniq,$(foreach x,$(THIRD_PARTY_DOUBLECONVERSION_A_DIRECTDEPS),$($(x))))

View file

@ -3,4 +3,4 @@
PKGS += THIRD_PARTY_INTEL
THIRD_PARTY_INTEL_HDRS = $(filter %.h,$(THIRD_PARTY_INTEL_FILES))
THIRD_PARTY_INTEL_FILES := $(wildcard third_party/intel/*)
THIRD_PARTY_INTEL_FILES := $(wildcard third_party/intel/*) $(wildcard third_party/intel/clang/*)

View file

@ -0,0 +1,140 @@
/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __WMMINTRIN_H
#error "Never use <__wmmintrin_aes.h> directly; include <wmmintrin.h> instead."
#endif
#ifndef __WMMINTRIN_AES_H
#define __WMMINTRIN_AES_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128)))
/// Performs a single round of AES encryption, transforming the state
/// value from the first source operand using a 128-bit round key value
/// contained in the second source operand, and writes the result to the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the encrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesenc_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
}
/// Performs the final round of AES encryption, transforming the state
/// value from the first source operand using a 128-bit round key value
/// contained in the second source operand, and writes the result to the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the encrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesenclast_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
}
/// Performs a single round of AES decryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the decrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesdec_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
}
/// Performs the final round of AES decryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the state value.
/// \param __R
/// A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the decrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesdeclast_si128(__m128i __V, __m128i __R)
{
return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
}
/// Applies the AES InvMixColumns() transformation to an expanded key
/// contained in the source operand, and writes the result to the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
///
/// \param __V
/// A 128-bit integer vector containing the expanded key.
/// \returns A 128-bit integer vector containing the transformed value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesimc_si128(__m128i __V)
{
return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
}
/// Generates a round key for AES encryption, operating on 128-bit data
/// specified in the first source operand and using an 8-bit round constant
/// specified by the second source operand, and writes the result to the
/// destination.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
/// \endcode
///
/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
///
/// \param C
/// A 128-bit integer vector that is used to generate the AES encryption key.
/// \param R
/// An 8-bit round constant used to generate the AES encryption key.
/// \returns A 128-bit round key for AES encryption.
#define _mm_aeskeygenassist_si128(C, R) \
((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))
#undef __DEFAULT_FN_ATTRS
#endif /* __WMMINTRIN_AES_H */
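A minimal sketch of how these rounds compose into a full AES-128 block encryption (not part of this diff; assumes a pre-expanded key schedule and -maes):

#include <wmmintrin.h>

/* Encrypt one 16-byte block given an expanded AES-128 key schedule
   rk[0..10] (key expansion not shown): one initial whitening XOR,
   nine full rounds, then the final round without MixColumns. */
__m128i aes128_encrypt_block(__m128i block, const __m128i rk[11]) {
  block = _mm_xor_si128(block, rk[0]);
  for (int i = 1; i < 10; i++)
    block = _mm_aesenc_si128(block, rk[i]);
  return _mm_aesenclast_si128(block, rk[10]);
}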

View file

@ -0,0 +1,48 @@
/*===---- __wmmintrin_pclmul.h - PCLMUL intrinsics --------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __WMMINTRIN_H
#error "Never use <__wmmintrin_pclmul.h> directly; include <wmmintrin.h> instead."
#endif
#ifndef __WMMINTRIN_PCLMUL_H
#define __WMMINTRIN_PCLMUL_H
/// Multiplies two 64-bit integer values, which are selected from source
/// operands using the immediate-value operand. The multiplication is a
/// carry-less multiplication, and the 128-bit integer product is stored in
/// the destination.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_clmulepi64_si128(__m128i X, __m128i Y, const int I);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
///
/// \param X
/// A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param Y
/// A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param I
/// An immediate value specifying which 64-bit values to select from the
/// operands. Bit 0 is used to select a value from operand \a X, and bit
/// 4 is used to select a value from operand \a Y: \n
/// Bit[0]=0 indicates that bits[63:0] of operand \a X are used. \n
/// Bit[0]=1 indicates that bits[127:64] of operand \a X are used. \n
/// Bit[4]=0 indicates that bits[63:0] of operand \a Y are used. \n
/// Bit[4]=1 indicates that bits[127:64] of operand \a Y are used.
/// \returns The 128-bit integer vector containing the result of the carry-less
/// multiplication of the selected 64-bit values.
#define _mm_clmulepi64_si128(X, Y, I) \
((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
(__v2di)(__m128i)(Y), (char)(I)))
#endif /* __WMMINTRIN_PCLMUL_H */
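As a sketch (not part of this diff; requires -mpclmul), the common use is a full 64x64 -> 128-bit carry-less product, the building block of GHASH and CRC folding:

#include <immintrin.h>

/* Carry-less multiply of two 64-bit values; immediate 0x00 selects
   bits [63:0] of both operands, so the product fills all 128 bits. */
__m128i clmul64(unsigned long long a, unsigned long long b) {
  return _mm_clmulepi64_si128(_mm_set_epi64x(0, (long long)a),
                              _mm_set_epi64x(0, (long long)b), 0x00);
}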

160
third_party/intel/clang/adcintrin.h vendored Normal file
View file

@ -0,0 +1,160 @@
/*===---- adcintrin.h - ADC intrinsics -------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __ADCINTRIN_H
#define __ADCINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
/* Use C++ inline semantics in C++, GNU inline for C mode. */
#if defined(__cplusplus)
#define __INLINE __inline
#else
#define __INLINE static __inline
#endif
#if defined(__cplusplus)
extern "C" {
#endif
/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store32(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADC instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// A 32-bit unsigned addend.
/// \param __y
/// A 32-bit unsigned addend.
/// \param __p
/// Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
unsigned int __x,
unsigned int __y,
unsigned int *__p) {
return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
}
/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry
/// flag \a __cf, and subtracts the result from unsigned 32-bit integer
/// \a __x. Stores the unsigned 32-bit difference in the memory at \a __p,
/// and returns the 8-bit carry-out (carry or overflow flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store32(__p, __x - (__y + temp))
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c SBB instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// The 32-bit unsigned minuend.
/// \param __y
/// The 32-bit unsigned subtrahend.
/// \param __p
/// Pointer to memory for storing the difference.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
unsigned int __x,
unsigned int __y,
unsigned int *__p) {
return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
}
#ifdef __x86_64__
/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store64(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADC instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// A 64-bit unsigned addend.
/// \param __y
/// A 64-bit unsigned addend.
/// \param __p
/// Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS
_addcarry_u64(unsigned char __cf, unsigned long long __x,
unsigned long long __y, unsigned long long *__p) {
return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
}
/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry
/// flag \a __cf, and subtracts the result from unsigned 64-bit integer
/// \a __x. Stores the unsigned 64-bit difference in the memory at \a __p,
/// and returns the 8-bit carry-out (carry or overflow flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store64(__p, __x - (__y + temp))
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c SBB instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// The 64-bit unsigned minuend.
/// \param __y
/// The 64-bit unsigned subtrahend.
/// \param __p
/// Pointer to memory for storing the difference.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS
_subborrow_u64(unsigned char __cf, unsigned long long __x,
unsigned long long __y, unsigned long long *__p) {
return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
}
#endif
#if defined(__cplusplus)
}
#endif
#undef __INLINE
#undef __DEFAULT_FN_ATTRS
#endif /* __ADCINTRIN_H */
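A sketch of the intended use (not part of this diff; x86-64): chaining the carry-out of each limb into the next gives arbitrary-precision addition.

#include <immintrin.h>

/* 256-bit addition as four chained _addcarry_u64 calls; each call
   feeds its carry flag into the next limb. */
unsigned char add256(unsigned long long r[4],
                     const unsigned long long a[4],
                     const unsigned long long b[4]) {
  unsigned char c = 0;
  for (int i = 0; i < 4; i++)
    c = _addcarry_u64(c, a[i], b[i], &r[i]);
  return c;  /* carry out of the most significant limb */
}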

102
third_party/intel/clang/adxintrin.h vendored Normal file
View file

@ -0,0 +1,102 @@
/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __ADXINTRIN_H
#define __ADXINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("adx")))
/* Use C++ inline semantics in C++, GNU inline for C mode. */
#if defined(__cplusplus)
#define __INLINE __inline
#else
#define __INLINE static __inline
#endif
#if defined(__cplusplus)
extern "C" {
#endif
/* Intrinsics that are available only if __ADX__ is defined. */
/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store32(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADCX instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// A 32-bit unsigned addend.
/// \param __y
/// A 32-bit unsigned addend.
/// \param __p
/// Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarryx_u32(unsigned char __cf,
unsigned int __x,
unsigned int __y,
unsigned int *__p) {
return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
}
#ifdef __x86_64__
/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
/// at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store64(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADCX instruction.
///
/// \param __cf
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
/// A 64-bit unsigned addend.
/// \param __y
/// A 64-bit unsigned addend.
/// \param __p
/// Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS
_addcarryx_u64(unsigned char __cf, unsigned long long __x,
unsigned long long __y, unsigned long long *__p) {
return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
}
#endif
#if defined(__cplusplus)
}
#endif
#undef __INLINE
#undef __DEFAULT_FN_ATTRS
#endif /* __ADXINTRIN_H */
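Usage mirrors _addcarry_u64 above; the difference (a sketch, assuming -madx) is that _addcarryx_* lets the compiler emit ADCX/ADOX, which update separate flags (CF vs. OF), so two independent carry chains can run interleaved in multi-word multiply loops.

#include <immintrin.h>

/* Same 256-bit add as before, but through the ADX form; with -madx the
   compiler may schedule independent chains like this one side by side. */
unsigned char add256_adx(unsigned long long r[4],
                         const unsigned long long a[4],
                         const unsigned long long b[4]) {
  unsigned char c = 0;
  for (int i = 0; i < 4; i++)
    c = _addcarryx_u64(c, a[i], b[i], &r[i]);
  return c;
}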

183
third_party/intel/clang/ammintrin.h vendored Normal file
View file

@ -0,0 +1,183 @@
/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __AMMINTRIN_H
#define __AMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include "pmmintrin.h"
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128)))
/// Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param x
/// The value from which bits are extracted.
/// \param len
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
/// \param idx
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
/// [63:0] of parameter \a x are extracted. If the length is zero but the
/// index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
/// extracted from the source operand.
#define _mm_extracti_si64(x, len, idx) \
((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
(char)(len), (char)(idx)))
/// Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index and of the length specified by
/// \a __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param __x
/// The value from which bits are extracted.
/// \param __y
/// Specifies the index of the least significant bit at [13:8] and the
/// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
/// length is interpreted as 64. If the sum of the index and length is
/// greater than 64, the result is undefined. If the length and index are
/// both zero, bits [63:0] of parameter \a __x are extracted. If the length
/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
/// from the source operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_extract_si64(__m128i __x, __m128i __y)
{
return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
}
/// Inserts bits of a specified length from the source integer vector
/// \a y into the lower 64 bits of the destination integer vector \a x at
/// the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
/// const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param x
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length \a len and by the index \a idx specifying the
/// least significant bit.
/// \param y
/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand \a y of length \a len.
/// \param len
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
/// are zero, the length is interpreted as 64.
/// \param idx
/// Bits [5:0] specify the index of the least significant bit; the other
/// bits are ignored. If the sum of the index and length is greater than 64,
/// the result is undefined. If the length and index are both zero, bits
/// [63:0] of parameter \a y are inserted into parameter \a x. If the length
/// is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
/// destination operand \a x with the specified bitfields replaced by the
/// lower bits of source operand \a y. The upper 64 bits of the return value
/// are undefined.
#define _mm_inserti_si64(x, y, len, idx) \
((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
(__v2di)(__m128i)(y), \
(char)(len), (char)(idx)))
/// Inserts bits of a specified length from the source integer vector
/// \a __y into the lower 64 bits of the destination integer vector \a __x
/// at the index and of the length specified by \a __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param __x
/// The destination operand where bits will be inserted. The inserted bits
/// are defined by the length and by the index of the least significant bit
/// specified by operand \a __y.
/// \param __y
/// The source operand containing the bits to be extracted. The extracted
/// bits are the least significant bits of operand \a __y with length
/// specified by bits [69:64]. These are inserted into the destination at the
/// index specified by bits [77:72]; all other bits are ignored. If bits
/// [69:64] are zero, the length is interpreted as 64. If the sum of the
/// index and length is greater than 64, the result is undefined. If the
/// length and index are both zero, bits [63:0] of parameter \a __y are
/// inserted into parameter \a __x. If the length is zero but the index is
/// non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
/// destination operand \a __x with the specified bitfields replaced by the
/// lower bits of source operand \a __y. The upper 64 bits of the return
/// value are undefined.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_si64(__m128i __x, __m128i __y)
{
return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
}
/// Stores a 64-bit double-precision value in a 64-bit memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
///
/// \param __p
/// The 64-bit memory location used to store the register value.
/// \param __a
/// The 64-bit double-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_sd(void *__p, __m128d __a)
{
__builtin_ia32_movntsd((double *)__p, (__v2df)__a);
}
/// Stores a 32-bit single-precision floating-point value in a 32-bit
/// memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
///
/// \param __p
/// The 32-bit memory location used to store the register value.
/// \param __a
/// The 32-bit single-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ss(void *__p, __m128 __a)
{
__builtin_ia32_movntss((float *)__p, (__v4sf)__a);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __AMMINTRIN_H */
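A sketch of the immediate-form extract (not part of this diff; SSE4a is AMD-only, -msse4a): pulling an 8-bit field that starts at bit 16 out of the low 64 bits of a vector.

#include <ammintrin.h>

/* EXTRQ with len=8, idx=16: the extracted field lands in the low bits
   of the result, which _mm_cvtsi128_si32 then moves to a scalar. */
unsigned char extract_byte2(__m128i v) {
  __m128i f = _mm_extracti_si64(v, 8, 16);
  return (unsigned char)_mm_cvtsi128_si32(f);
}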

View file

@ -0,0 +1,169 @@
/*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===------------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H
#ifndef __AMX_COMPLEXINTRIN_H
#define __AMX_COMPLEXINTRIN_H
#ifdef __x86_64__
#define __DEFAULT_FN_ATTRS_COMPLEX \
__attribute__((__always_inline__, __nodebug__, __target__("amx-complex")))
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the imaginary part of the result. For each possible combination
/// of (row of \a a, column of \a b), it performs a set of multiplications
/// and accumulations on all corresponding complex numbers (one from \a a
/// and one from \a b). The imaginary part of the \a a element is multiplied
/// with the real part of the corresponding \a b element, and the real part
/// of the \a a element is multiplied with the imaginary part of the
/// corresponding \a b elements. The two accumulated results are added, and
/// then accumulated into the corresponding row and column of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
/// tmp := dst.row[m]
/// FOR k := 0 TO (a.colsb / 4) - 1
/// FOR n := 0 TO (dst.colsb / 4) - 1
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
/// ENDFOR
/// ENDFOR
/// write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param a
/// The 1st source tile. Max size is 1024 Bytes.
/// \param b
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_cmmimfp16ps(dst, a, b) __builtin_ia32_tcmmimfp16ps(dst, a, b)
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the real part of the result. For each possible combination
/// of (row of \a a, column of \a b), it performs a set of multiplications
/// and accumulations on all corresponding complex numbers (one from \a a
/// and one from \a b). The real part of the \a a element is multiplied
/// with the real part of the corresponding \a b element, and the negated
/// imaginary part of the \a a element is multiplied with the imaginary
/// part of the corresponding \a b elements. The two accumulated results
/// are added, and then accumulated into the corresponding row and column
/// of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
/// tmp := dst.row[m]
/// FOR k := 0 TO (a.colsb / 4) - 1
/// FOR n := 0 TO (dst.colsb / 4) - 1
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
/// ENDFOR
/// ENDFOR
/// write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMRLFP16PS instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param a
/// The 1st source tile. Max size is 1024 Bytes.
/// \param b
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_cmmrlfp16ps(dst, a, b) __builtin_ia32_tcmmrlfp16ps(dst, a, b)
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2);
}
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
}
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number with
/// FP16 real part and FP16 imaginary part.
/// This function calculates the imaginary part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCMMIMFP16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_COMPLEX
static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col,
dst->tile, src0.tile, src1.tile);
}
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number with
/// FP16 real part and FP16 imaginary part.
/// This function calculates the real part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCMMRLFP16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_COMPLEX
static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col,
dst->tile, src0.tile, src1.tile);
}
#endif // __x86_64__
#endif // __AMX_COMPLEXINTRIN_H
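A sketch with hypothetical 16x64-byte tile shapes (not part of this diff; assumes the OS has granted AMX state, -mamx-complex -mamx-tile, and the __tile_stored wrapper from amxintrin.h):

#include <immintrin.h>

/* Accumulate the imaginary parts of a complex FP16 tile product into
   an FP32 tile via the __tile wrapper API; acc.tile starts zeroed. */
void cmul_im(float *out, const void *a, const void *b) {
  __tile1024i ta  = {16, 64};
  __tile1024i tb  = {16, 64};
  __tile1024i acc = {16, 64};
  __tile_loadd(&ta, a, 64);
  __tile_loadd(&tb, b, 64);
  __tile_cmmimfp16ps(&acc, ta, tb);
  __tile_stored(out, 64, acc);
}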

58
third_party/intel/clang/amxfp16intrin.h vendored Normal file
View file

@ -0,0 +1,58 @@
/*===------------- amxfp16intrin.h - AMX_FP16 intrinsics -*- C++ -*---------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===------------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <amxfp16intrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */
#ifndef __AMX_FP16INTRIN_H
#define __AMX_FP16INTRIN_H
#ifdef __x86_64__
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
/// and \a b, accumulating the intermediate single-precision (32-bit)
/// floating-point elements with elements in \a dst, and store the 32-bit
/// result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_dpfp16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
/// tmp := dst.row[m]
/// FOR k := 0 TO (a.colsb / 4) - 1
/// FOR n := 0 TO (dst.colsb / 4) - 1
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
/// FP32(b.row[k].fp16[2*n+0])
/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
/// FP32(b.row[k].fp16[2*n+1])
/// ENDFOR
/// ENDFOR
/// write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPFP16PS instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param a
/// The 1st source tile. Max size is 1024 Bytes.
/// \param b
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpfp16ps(dst, a, b) \
__builtin_ia32_tdpfp16ps(dst, a, b)
#endif /* __x86_64__ */
#endif /* __AMX_FP16INTRIN_H */
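A sketch of the raw register-number flow this macro belongs to (not part of this diff; on Linux the process must first request AMX state via arch_prctl, and the 64-byte config blob must already describe tiles 0-2):

#include <immintrin.h>

/* tmm0 += tmm1 * tmm2 over FP16 pairs, accumulating in FP32. */
void dpfp16(float *c, const void *a, const void *b, const void *cfg) {
  _tile_loadconfig(cfg);    /* load the prepared 64-byte configuration */
  _tile_loadd(0, c, 64);    /* accumulator */
  _tile_loadd(1, a, 64);
  _tile_loadd(2, b, 64);
  _tile_dpfp16ps(0, 1, 2);
  _tile_stored(0, c, 64);
  _tile_release();
}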

524
third_party/intel/clang/amxintrin.h vendored Normal file
View file

@ -0,0 +1,524 @@
/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===------------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */
#ifndef __AMXINTRIN_H
#define __AMXINTRIN_H
#ifdef __x86_64__
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS_TILE \
__attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
#define __DEFAULT_FN_ATTRS_INT8 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
#define __DEFAULT_FN_ATTRS_FP16 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
/// Load tile configuration from a 64-byte memory location specified by
/// "mem_addr". The tile configuration includes the tile type palette, the
/// number of bytes per row, and the number of rows. If the specified
/// palette_id is zero, that signifies the init state for both the tile
/// config and the tile data, and the tiles are zeroed. Any invalid
/// configurations will result in #GP fault.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
///
/// \param __config
/// A pointer to 512-bits configuration
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_loadconfig(const void *__config) {
__builtin_ia32_tile_loadconfig(__config);
}
/// Stores the current tile configuration to a 64-byte memory location
/// specified by "mem_addr". The tile configuration includes the tile type
/// palette, the number of bytes per row, and the number of rows. If tiles
/// are not configured, all zeroes will be stored to memory.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
///
/// \param __config
/// A pointer to 512-bits configuration
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_storeconfig(void *__config) {
__builtin_ia32_tile_storeconfig(__config);
}
/// Release the tile configuration to return to the init state, which
/// releases all storage it currently holds.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
__builtin_ia32_tilerelease();
}
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
#define _tile_loadd(dst, base, stride) \
__builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
(__SIZE_TYPE__)(stride))
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
#define _tile_stream_loadd(dst, base, stride) \
__builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
(__SIZE_TYPE__)(stride))
/// Store the tile specified by "src" to memory specified by "base" address and
/// "stride" using the tile configuration previously configured via
/// "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be stored in memory.
#define _tile_stored(dst, base, stride) \
__builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
/// Zero the tile specified by "tile".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param tile
/// The destination tile to be zeroed. Max size is 1024 Bytes.
#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1) \
__builtin_ia32_tdpbssd((dst), (src0), (src1))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1) \
__builtin_ia32_tdpbsud((dst), (src0), (src1))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1) \
__builtin_ia32_tdpbusd((dst), (src0), (src1))
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1) \
__builtin_ia32_tdpbuud((dst), (src0), (src1))
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1) \
__builtin_ia32_tdpbf16ps((dst), (src0), (src1))
/// The AMX tile register size is configurable; the maximum size is
/// 16x64 = 1024 bytes. Since there is no 2D type in LLVM IR, we use a
/// vector type to represent the 2D tile, fixed at the maximum AMX tile
/// register size.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
__SIZE_TYPE__ stride) {
return __builtin_ia32_tileloadd64_internal(m, n, base,
(__SIZE_TYPE__)(stride));
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
__SIZE_TYPE__ stride) {
return __builtin_ia32_tileloaddt164_internal(m, n, base,
(__SIZE_TYPE__)(stride));
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ void __DEFAULT_FN_ATTRS_INT8
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
__SIZE_TYPE__ stride, _tile1024i tile) {
return __builtin_ia32_tilestored64_internal(m, n, base,
(__SIZE_TYPE__)(stride), tile);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
}
/// This struct packs the shape and tile data together for the user. We
/// suggest initializing the struct as early as possible, because the
/// compiler depends on the shape information to configure the tiles.
/// Constant values are preferred for compiler optimization.
typedef struct __tile1024i_str {
const unsigned short row;
const unsigned short col;
_tile1024i tile;
} __tile1024i;
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Store the tile specified by "src" to memory specified by "base" address and
/// "stride".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be stored in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
__tile1024i src) {
_tile_stored_internal(src.row, src.col, base, stride, src.tile);
}
/// Zero the tile specified by "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param dst
/// The destination tile to be zeroed. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_zero(__tile1024i *dst) {
dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
}
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_BF16
static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP16
static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16
#undef __DEFAULT_FN_ATTRS_FP16
#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */
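As a usage sketch (not part of the vendored header): the __tile1024i wrappers
above compose into a single int8 matrix-multiply step on one tile. The helper
name and the 16x64 shapes below are illustrative; building it would require
clang with -mamx-tile -mamx-int8 on an OS that has enabled AMX tile state.

#include <immintrin.h>

/* Hypothetical helper: one AMX step computing c[16][16] (int32) +=
   a[16][64] (int8) . b[16][64] (int8), via the wrappers defined above. */
void tile_matmul_step(int *c, const signed char *a, const signed char *b) {
  __tile1024i ta = {16, 64};  /* 16 rows x 64 bytes of int8 */
  __tile1024i tb = {16, 64};
  __tile1024i tc = {16, 64};  /* 16 rows x 16 int32 = 64 bytes per row */
  __tile_loadd(&ta, a, 64);   /* stride: 64 bytes between rows */
  __tile_loadd(&tb, b, 64);
  __tile_zero(&tc);
  __tile_dpbssd(&tc, ta, tb); /* 4-way signed int8 dot products into int32 */
  __tile_stored(c, 64, tc);
}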

5284
third_party/intel/clang/avx2intrin.h vendored Normal file

File diff suppressed because it is too large

283
third_party/intel/clang/avx512bf16intrin.h vendored Normal file
View file

@ -0,0 +1,283 @@
/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
#endif
#ifdef __SSE2__
#ifndef __AVX512BF16INTRIN_H
#define __AVX512BF16INTRIN_H
typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
#define __DEFAULT_FN_ATTRS512 \
__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16,evex512"), \
__min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512bf16,no-evex512")))
/// Convert One BF16 Data to One Single Float Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic does not correspond to a specific instruction.
///
/// \param __A
/// A bfloat data.
/// \returns A float value whose sign and exponent fields are unchanged, and
/// whose fraction field is extended to 23 bits.
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
return __builtin_ia32_cvtsbf162ss_32(__A);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __B
/// A 512-bit vector of [16 x float].
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
(__v16sf) __B);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __B
/// A 512-bit vector of [16 x float].
/// \param __W
/// A 512-bit vector of [32 x bfloat].
/// \param __U
/// A 32-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element from __W.
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
(__v32bf)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __B
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 32-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element is zero.
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
(__v32bf)_mm512_setzero_si512());
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \returns A 256-bit vector of [16 x bfloat] resulting from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_cvtneps_pbh(__m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
(__v16bf)_mm256_undefined_si256(),
(__mmask16)-1);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __W
/// A 256-bit vector of [16 x bfloat].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 256-bit vector of [16 x bfloat] resulting from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
(__v16bf)__W,
(__mmask16)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 256-bit vector of [16 x bfloat] resulting from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
(__v16bf)_mm256_setzero_si256(),
(__mmask16)__U);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 512-bit vector of [32 x bfloat].
/// \param __B
/// A 512-bit vector of [32 x bfloat].
/// \param __D
/// A 512-bit vector of [16 x float].
/// \returns A 512-bit vector of [16 x float] containing the dot product of
/// __A and __B accumulated with __D.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
(__v32bf) __A,
(__v32bf) __B);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 512-bit vector of [32 x bfloat].
/// \param __B
/// A 512-bit vector of [32 x bfloat].
/// \param __D
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
/// \returns A 512-bit vector of [16 x float] containing the dot product of
/// __A and __B accumulated with __D.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
(__v16sf)__D);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 512-bit vector of [32 x bfloat].
/// \param __B
/// A 512-bit vector of [32 x bfloat].
/// \param __D
/// A 512-bit vector of [16 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
/// \returns A 512-bit vector of [16 x float] containing the dot product of
/// __A and __B accumulated with __D.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
(__v16sf)_mm512_setzero_si512());
}
/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] resulting from conversion of __A.
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
/// A 16-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] resulting from conversion of __A.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
(__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
/// A 512-bit vector of [16 x float]. Elements are copied from __S when
/// the corresponding mask bit is not set.
/// \param __U
/// A 16-bit mask.
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] resulting from conversion of __A.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
(__m512i)__S, (__mmask16)__U,
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS512
#endif
#endif
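A minimal sketch of how these fit together (the function name and the split of
each 32-element operand into two __m512 halves are illustrative; assumes
-mavx512bf16): round floats to packed bf16 with VCVTNE2PS2BF16, then
accumulate pairwise products with VDPBF16PS.

#include <immintrin.h>

/* Accumulate the products of 32 float pairs (given as two __m512 halves per
   operand) into 16 single-precision partial sums via bf16 dot products. */
static inline __m512 bf16_dot_step(__m512 acc, __m512 a_lo, __m512 a_hi,
                                   __m512 b_lo, __m512 b_hi) {
  __m512bh a = _mm512_cvtne2ps_pbh(a_hi, a_lo); /* low 256 bits <- a_lo */
  __m512bh b = _mm512_cvtne2ps_pbh(b_hi, b_lo);
  /* acc[i] += a[2i]*b[2i] + a[2i+1]*b[2i+1] */
  return _mm512_dpbf16_ps(acc, a, b);
}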

86
third_party/intel/clang/avx512bitalgintrin.h vendored Normal file
View file

@ -0,0 +1,86 @@
/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512BITALGINTRIN_H
#define __AVX512BITALGINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512bitalg,evex512"), \
__min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi16(__m512i __A)
{
return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
{
return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
(__v32hi) _mm512_popcnt_epi16(__B),
(__v32hi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
{
return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(),
__U,
__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi8(__m512i __A)
{
return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
{
return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
(__v64qi) _mm512_popcnt_epi8(__B),
(__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
{
return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(),
__U,
__B);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
{
return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
(__v64qi) __B,
__U);
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
{
return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
__A,
__B);
}
#undef __DEFAULT_FN_ATTRS
#endif
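A hedged usage sketch (helper name mine; assumes -mavx512bitalg -mavx512bw):
VPOPCNTB gives per-byte bit counts, which VPSADBW then folds into 64-bit lane
sums, giving the population count of a full 64-byte block.

#include <immintrin.h>
#include <stdint.h>

/* Population count of one 64-byte block. */
uint64_t popcnt_block64(const void *p) {
  __m512i v = _mm512_loadu_si512(p);
  __m512i bytes = _mm512_popcnt_epi8(v);  /* per-byte bit counts */
  __m512i sums = _mm512_sad_epu8(bytes, _mm512_setzero_si512()); /* 8 x u64 */
  return _mm512_reduce_add_epi64(sums);
}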

2014
third_party/intel/clang/avx512bwintrin.h vendored Normal file

File diff suppressed because it is too large

125
third_party/intel/clang/avx512cdintrin.h vendored Normal file
View file

@ -0,0 +1,125 @@
/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512CDINTRIN_H
#define __AVX512CDINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512cd,evex512"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi64 (__m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_conflict_epi64(__A),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_conflict_epi64(__A),
(__v8di)_mm512_setzero_si512 ());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi32 (__m512i __A)
{
return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_conflict_epi32(__A),
(__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_conflict_epi32(__A),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi32 (__m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_lzcnt_epi32(__A),
(__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_lzcnt_epi32(__A),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi64 (__m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_lzcnt_epi64(__A),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
{
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_lzcnt_epi64(__A),
(__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmb_epi64 (__mmask8 __A)
{
return (__m512i) _mm512_set1_epi64((long long) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmw_epi32 (__mmask16 __A)
{
return (__m512i) _mm512_set1_epi32((int) __A);
}
#undef __DEFAULT_FN_ATTRS
#endif
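A small sketch of the classic use case (function name mine; assumes
-mavx512cd): VPCONFLICTD flags lanes whose index value already appeared in a
lower lane, the usual precondition test before a vectorized scatter-update.

#include <immintrin.h>

/* Returns nonzero if any two of the 16 32-bit indices are equal. */
int indices_conflict(__m512i idx) {
  /* bit j of lane i is set iff idx[i] == idx[j] for some j < i */
  __m512i c = _mm512_conflict_epi32(idx);
  return _mm512_test_epi32_mask(c, c) != 0;
}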

1379
third_party/intel/clang/avx512dqintrin.h vendored Normal file

File diff suppressed because it is too large

271
third_party/intel/clang/avx512erintrin.h vendored Normal file
View file

@ -0,0 +1,271 @@
/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512ERINTRIN_H
#define __AVX512ERINTRIN_H
/* exp2a23 */
#define _mm512_exp2a23_round_pd(A, R) \
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)))
#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)))
#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R)))
#define _mm512_exp2a23_pd(A) \
_mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_exp2a23_pd(S, M, A) \
_mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_exp2a23_pd(M, A) \
_mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_exp2a23_round_ps(A, R) \
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)))
#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)))
#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R)))
#define _mm512_exp2a23_ps(A) \
_mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_exp2a23_ps(S, M, A) \
_mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_exp2a23_ps(M, A) \
_mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
/* rsqrt28 */
#define _mm512_rsqrt28_round_pd(A, R) \
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)))
#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)))
#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R)))
#define _mm512_rsqrt28_pd(A) \
_mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rsqrt28_pd(S, M, A) \
_mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rsqrt28_pd(M, A) \
_mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_rsqrt28_round_ps(A, R) \
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)))
#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)))
#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R)))
#define _mm512_rsqrt28_ps(A) \
_mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rsqrt28_ps(S, M, A) \
_mm512_mask_rsqrt28_round_ps((S), (M), A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rsqrt28_ps(M, A) \
_mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_ss(A, B, R) \
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R)))
#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R)))
#define _mm_rsqrt28_ss(A, B) \
_mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_mask_rsqrt28_ss(S, M, A, B) \
_mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_rsqrt28_ss(M, A, B) \
_mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_sd(A, B, R) \
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R)))
#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R)))
#define _mm_rsqrt28_sd(A, B) \
_mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_mask_rsqrt28_sd(S, M, A, B) \
_mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_rsqrt28_sd(M, A, B) \
_mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
/* rcp28 */
#define _mm512_rcp28_round_pd(A, R) \
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)))
#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)))
#define _mm512_maskz_rcp28_round_pd(M, A, R) \
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R)))
#define _mm512_rcp28_pd(A) \
_mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rcp28_pd(S, M, A) \
_mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rcp28_pd(M, A) \
_mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_rcp28_round_ps(A, R) \
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)))
#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)))
#define _mm512_maskz_rcp28_round_ps(M, A, R) \
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R)))
#define _mm512_rcp28_ps(A) \
_mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rcp28_ps(S, M, A) \
_mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rcp28_ps(M, A) \
_mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_ss(A, B, R) \
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R)))
#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R)))
#define _mm_rcp28_ss(A, B) \
_mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_mask_rcp28_ss(S, M, A, B) \
_mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_rcp28_ss(M, A, B) \
_mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_sd(A, B, R) \
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R)))
#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R)))
#define _mm_rcp28_sd(A, B) \
_mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_mask_rcp28_sd(S, M, A, B) \
_mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_rcp28_sd(M, A, B) \
_mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#endif /* __AVX512ERINTRIN_H */
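Usage sketch (function name mine; AVX512ER shipped only on Xeon Phi, so this
assumes -mavx512er): the 2^-28-accurate reciprocal square root is typically
used directly, here with SAE to suppress floating-point exceptions.

#include <immintrin.h>

/* Approximate 1/sqrt(x) for 16 floats to about 2^-28 relative error. */
__m512 fast_rsqrt16(__m512 x) {
  return _mm512_rsqrt28_round_ps(x, _MM_FROUND_NO_EXC);
}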

9779
third_party/intel/clang/avx512fintrin.h vendored Normal file

File diff suppressed because it is too large

3352
third_party/intel/clang/avx512fp16intrin.h vendored Normal file

File diff suppressed because it is too large

70
third_party/intel/clang/avx512ifmaintrin.h vendored Normal file
View file

@ -0,0 +1,70 @@
/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __IFMAINTRIN_H
#define __IFMAINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512ifma,evex512"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y,
(__v8di) __Z);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52hi_epu64(__W, __X, __Y),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
(__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y,
(__v8di) __Z);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52lo_epu64(__W, __X, __Y),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
(__v8di)_mm512_setzero_si512());
}
#undef __DEFAULT_FN_ATTRS
#endif
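Sketch of the intended bignum use (names mine; assumes -mavx512ifma): with
limbs in radix 2^52, one VPMADD52 pair accumulates the low and high 52-bit
halves of eight 52x52-bit products.

#include <immintrin.h>

/* One limb-product step of a radix-2^52 multiply, eight lanes at a time:
   *lo += (x*y) mod 2^52, *hi += (x*y) >> 52 (inputs must be < 2^52). */
static inline void madd52_step(__m512i *lo, __m512i *hi,
                               __m512i x, __m512i y) {
  *lo = _mm512_madd52lo_epu64(*lo, x, y);
  *hi = _mm512_madd52hi_epu64(*hi, x, y);
}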

111
third_party/intel/clang/avx512ifmavlintrin.h vendored Normal file
View file

@ -0,0 +1,111 @@
/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __IFMAVLINTRIN_H
#define __IFMAVLINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512ifma,avx512vl,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512ifma,avx512vl,no-evex512"), \
__min_vector_width__(256)))
#define _mm_madd52hi_epu64(X, Y, Z) \
((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y), \
(__v2di)(Z)))
#define _mm256_madd52hi_epu64(X, Y, Z) \
((__m256i)__builtin_ia32_vpmadd52huq256((__v4di)(X), (__v4di)(Y), \
(__v4di)(Z)))
#define _mm_madd52lo_epu64(X, Y, Z) \
((__m128i)__builtin_ia32_vpmadd52luq128((__v2di)(X), (__v2di)(Y), \
(__v2di)(Z)))
#define _mm256_madd52lo_epu64(X, Y, Z) \
((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y), \
(__v4di)(Z)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52hi_epu64(__W, __X, __Y),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52hi_epu64(__X, __Y, __Z),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52hi_epu64(__W, __X, __Y),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z),
(__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52lo_epu64(__W, __X, __Y),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52lo_epu64(__X, __Y, __Z),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52lo_epu64(__W, __X, __Y),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z),
(__v4di)_mm256_setzero_si256());
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif
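The 128/256-bit forms compose the same way; a one-step sketch (name mine;
assumes -mavx512ifma -mavx512vl), here with a zeroing mask:

#include <immintrin.h>

/* Four-lane low-half 52-bit multiply-add; lanes with a clear mask bit
   are zeroed rather than accumulated. */
__m256i madd52lo_masked4(__mmask8 m, __m256i acc, __m256i x, __m256i y) {
  return _mm256_maskz_madd52lo_epu64(m, acc, x, y);
}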

92
third_party/intel/clang/avx512pfintrin.h vendored Normal file
View file

@ -0,0 +1,92 @@
/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512PFINTRIN_H
#define __AVX512PFINTRIN_H
#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
(void const *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
__builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
(void const *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfdps((__mmask16)(mask), \
(__v16si)(__m512i)(index), (void const *)(addr), \
(int)(scale), (int)(hint))
#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
__builtin_ia32_gatherpfdps((__mmask16) -1, \
(__v16si)(__m512i)(index), (void const *)(addr), \
(int)(scale), (int)(hint))
#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
(void const *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
__builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
(void const *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
(void const *)(addr), (int)(scale), (int)(hint))
#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
__builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
(void const *)(addr), (int)(scale), (int)(hint))
#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
__builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
(void *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
(void *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
__builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
(void *)(addr), (int)(scale), (int)(hint))
#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfdps((__mmask16)(mask), \
(__v16si)(__m512i)(index), (void *)(addr), \
(int)(scale), (int)(hint))
#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
__builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
(void *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
(void *)(addr), (int)(scale), \
(int)(hint))
#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
__builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
(void *)(addr), (int)(scale), (int)(hint))
#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
(void *)(addr), (int)(scale), (int)(hint))
#endif
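Usage sketch (name mine; AVX512PF is Xeon Phi-only, so this assumes
-mavx512pf): issue a T0 prefetch for the cache lines a subsequent masked
gather will touch. The scale is in bytes per index unit, so 4 for float data.

#include <immintrin.h>

/* Warm the caches for an upcoming 16-lane float gather at base[idx[i]]. */
void prefetch_float_gather(const float *base, __m512i idx, __mmask16 m) {
  _mm512_mask_prefetch_i32gather_ps(idx, m, base, 4, _MM_HINT_T0);
}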

357
third_party/intel/clang/avx512vbmi2intrin.h vendored Normal file
View file

@ -0,0 +1,357 @@
/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VBMI2INTRIN_H
#define __AVX512VBMI2INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2,evex512"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
(__v32hi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
(__v32hi) _mm512_setzero_si512(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
(__v64qi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
(__v64qi) _mm512_setzero_si512(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D)
{
__builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D)
{
__builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
(__v32hi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
(__v32hi) _mm512_setzero_si512(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
(__v64qi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
(__v64qi) _mm512_setzero_si512(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
(__v32hi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
(__v32hi) _mm512_setzero_si512(),
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
(__v64qi) __S,
__U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
(__v64qi) _mm512_setzero_si512(),
__U);
}
#define _mm512_shldi_epi64(A, B, I) \
((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I)))
#define _mm512_mask_shldi_epi64(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
(__v8di)(__m512i)(S)))
#define _mm512_maskz_shldi_epi64(U, A, B, I) \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
(__v8di)_mm512_setzero_si512()))
#define _mm512_shldi_epi32(A, B, I) \
((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I)))
#define _mm512_mask_shldi_epi32(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
(__v16si)(__m512i)(S)))
#define _mm512_maskz_shldi_epi32(U, A, B, I) \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
(__v16si)_mm512_setzero_si512()))
#define _mm512_shldi_epi16(A, B, I) \
((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I)))
#define _mm512_mask_shldi_epi16(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
(__v32hi)(__m512i)(S)))
#define _mm512_maskz_shldi_epi16(U, A, B, I) \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
(__v32hi)_mm512_setzero_si512()))
#define _mm512_shrdi_epi64(A, B, I) \
((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I)))
#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
(__v8di)(__m512i)(S)))
#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
(__v8di)_mm512_setzero_si512()))
#define _mm512_shrdi_epi32(A, B, I) \
((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I)))
#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
(__v16si)(__m512i)(S)))
#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
(__v16si)_mm512_setzero_si512()))
#define _mm512_shrdi_epi16(A, B, I) \
((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I)))
#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
(__v32hi)(__m512i)(S)))
#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
(__v32hi)_mm512_setzero_si512()))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshldvq512((__v8di)__A, (__v8di)__B,
(__v8di)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectq_512(__U,
(__v8di)_mm512_shldv_epi64(__A, __B, __C),
(__v8di)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectq_512(__U,
(__v8di)_mm512_shldv_epi64(__A, __B, __C),
(__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshldvd512((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_shldv_epi32(__A, __B, __C),
(__v16si)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_shldv_epi32(__A, __B, __C),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshldvw512((__v32hi)__A, (__v32hi)__B,
(__v32hi)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectw_512(__U,
(__v32hi)_mm512_shldv_epi16(__A, __B, __C),
(__v32hi)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectw_512(__U,
(__v32hi)_mm512_shldv_epi16(__A, __B, __C),
(__v32hi)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshrdvq512((__v8di)__A, (__v8di)__B,
(__v8di)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectq_512(__U,
(__v8di)_mm512_shrdv_epi64(__A, __B, __C),
(__v8di)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectq_512(__U,
(__v8di)_mm512_shrdv_epi64(__A, __B, __C),
(__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshrdvd512((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
{
return (__m512i) __builtin_ia32_selectd_512(__U,
(__v16si)_mm512_shrdv_epi32(__A, __B, __C),
(__v16si)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i) __builtin_ia32_selectd_512(__U,
(__v16si)_mm512_shrdv_epi32(__A, __B, __C),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_vpshrdvw512((__v32hi)__A, (__v32hi)__B,
(__v32hi)__C);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectw_512(__U,
(__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
(__v32hi)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
{
return (__m512i)__builtin_ia32_selectw_512(__U,
(__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
(__v32hi)_mm512_setzero_si512());
}
#undef __DEFAULT_FN_ATTRS
#endif
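A sketch of the canonical compress use (name mine; assumes -mavx512vbmi2):
left-pack the selected 16-bit elements and store them contiguously, returning
how many were kept.

#include <immintrin.h>
#include <stdint.h>

/* Store the 16-bit elements of v selected by `keep`, packed to the front
   of dst; returns the number of elements written. */
int filter_epi16(int16_t *dst, __m512i v, __mmask32 keep) {
  _mm512_mask_compressstoreu_epi16(dst, keep, v);
  return __builtin_popcount(keep); /* __mmask32 is a plain unsigned int */
}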

106
third_party/intel/clang/avx512vbmiintrin.h vendored Normal file
View file

@ -0,0 +1,106 @@
/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __VBMIINTRIN_H
#define __VBMIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vbmi,evex512"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
{
return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
(__v64qi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
__m512i __B)
{
return (__m512i)__builtin_ia32_selectb_512(__U,
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
(__v64qi)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
__m512i __B)
{
return (__m512i)__builtin_ia32_selectb_512(__U,
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
(__v64qi)__I);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
__m512i __B)
{
return (__m512i)__builtin_ia32_selectb_512(__U,
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
(__v64qi)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
__m512i __B)
{
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_permutexvar_epi8(__A, __B),
(__v64qi)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
__m512i __B)
{
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_permutexvar_epi8(__A, __B),
(__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y)
{
return (__m512i)__builtin_ia32_vpmultishiftqb512((__v64qi)__X, (__v64qi) __Y);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X,
__m512i __Y)
{
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
(__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y)
{
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
(__v64qi)_mm512_setzero_si512());
}
#undef __DEFAULT_FN_ATTRS
#endif
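Usage sketch (name mine; assumes -mavx512vbmi): VPERMB is an arbitrary byte
shuffle across all 64 lanes, i.e. a full 64-entry byte table lookup.

#include <immintrin.h>

/* out[i] = table[idx[i] & 63] for each of the 64 byte lanes. */
__m512i byte_lookup64(__m512i idx, __m512i table) {
  return _mm512_permutexvar_epi8(idx, table);
}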

193
third_party/intel/clang/avx512vbmivlintrin.h vendored Normal file
View file

@ -0,0 +1,193 @@
/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __VBMIVLINTRIN_H
#define __VBMIVLINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vbmi,avx512vl,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vbmi,avx512vl,no-evex512"), \
__min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
{
return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A,
(__v16qi)__I,
(__v16qi)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
__m128i __B)
{
return (__m128i)__builtin_ia32_selectb_128(__U,
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
(__v16qi)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
__m128i __B)
{
return (__m128i)__builtin_ia32_selectb_128(__U,
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
(__v16qi)__I);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
__m128i __B)
{
return (__m128i)__builtin_ia32_selectb_128(__U,
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
(__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
{
return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
(__v32qi)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
__m256i __B)
{
return (__m256i)__builtin_ia32_selectb_256(__U,
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
(__v32qi)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
__m256i __B)
{
return (__m256i)__builtin_ia32_selectb_256(__U,
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
(__v32qi)__I);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
__m256i __B)
{
return (__m256i)__builtin_ia32_selectb_256(__U,
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
(__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
(__v16qi)_mm_permutexvar_epi8(__A, __B),
(__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
__m128i __B)
{
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
(__v16qi)_mm_permutexvar_epi8(__A, __B),
(__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
__m256i __B)
{
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
(__v32qi)_mm256_permutexvar_epi8(__A, __B),
(__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
__m256i __B)
{
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
(__v32qi)_mm256_permutexvar_epi8(__A, __B),
(__v32qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_vpmultishiftqb128((__v16qi)__X, (__v16qi)__Y);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X,
__m128i __Y)
{
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
(__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
(__v16qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
(__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
(__v16qi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_vpmultishiftqb256((__v32qi)__X, (__v32qi)__Y);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X,
__m256i __Y)
{
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
(__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
(__v32qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
(__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
(__v32qi)_mm256_setzero_si256());
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif
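
A short sketch (editor's illustration, assuming AVX512VBMI+VL): the two-source byte permute acts as a 32-entry lookup table split across two XMM registers; each index byte uses its low 5 bits, with values 0-15 selecting from the first source and 16-31 from the second.

#include <immintrin.h>

// Look up 16 bytes in a 32-byte table held in (lo, hi).
static inline __m128i lut32_lookup(__m128i lo, __m128i hi, __m128i idx) {
  return _mm_permutex2var_epi8(lo, idx, hi);  // vpermt2b/vpermi2b
}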

517 third_party/intel/clang/avx512vlbf16intrin.h vendored Normal file

@@ -0,0 +1,517 @@
/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
#endif
#ifdef __SSE2__
#ifndef __AVX512VLBF16INTRIN_H
#define __AVX512VLBF16INTRIN_H
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512bf16,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512bf16,no-evex512"), \
__min_vector_width__(256)))
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __B
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
(__v4sf) __B);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __B
/// A 128-bit vector of [4 x float].
/// \param __W
/// A 128-bit vector of [8 x bfloat].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element from __W.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
(__v8bf)_mm_cvtne2ps_pbh(__A, __B),
(__v8bf)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __B
/// A 128-bit vector of [4 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element is zero.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
(__v8bf)_mm_cvtne2ps_pbh(__A, __B),
(__v8bf)_mm_setzero_si128());
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __B
/// A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
(__v8sf) __B);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __B
/// A 256-bit vector of [8 x float].
/// \param __W
/// A 256-bit vector of [16 x bfloat].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element from __W.
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
(__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
(__v16bf)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __B
/// A 256-bit vector of [8 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A or __B. A 0 means element is zero.
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
(__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
(__v16bf)_mm256_setzero_si256());
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __A, and higher 64 bits are 0.
#define _mm_cvtneps_pbh(A) \
((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __W
/// A 128-bit vector of [8 x bfloat].
/// \param __U
/// A 4-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __A, and higher 64 bits are 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
(__v8bf)__W,
(__mmask8)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 128-bit vector of [4 x float].
/// \param __U
/// A 4-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __A, and higher 64 bits are 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
(__v8bf)_mm_setzero_si128(),
(__mmask8)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \returns A 128-bit vector of [8 x bfloat] that comes from conversion of __A.
#define _mm256_cvtneps_pbh(A) \
((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __W
/// A 256-bit vector of [8 x bfloat].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 128-bit vector of [8 x bfloat] that comes from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
(__v8bf)__W,
(__mmask8)__U);
}
/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A 256-bit vector of [8 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 128-bit vector of [8 x bfloat] that comes from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
(__v8bf)_mm_setzero_si128(),
(__mmask8)__U);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \param __B
/// A 128-bit vector of [8 x bfloat].
/// \param __D
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] that comes from the dot product of
/// __A, __B, and __D.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
(__v8bf)__A,
(__v8bf)__B);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \param __B
/// A 128-bit vector of [8 x bfloat].
/// \param __D
/// A 128-bit vector of [4 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
/// \returns A 128-bit vector of [4 x float] that comes from the dot product of
/// __A, __B, and __D.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
(__v4sf)__D);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \param __B
/// A 128-bit vector of [8 x bfloat].
/// \param __D
/// A 128-bit vector of [4 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
/// \returns A 128-bit vector of [4 x float] that comes from the dot product of
/// __A, __B, and __D.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
(__v4sf)_mm_setzero_si128());
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \param __B
/// A 256-bit vector of [16 x bfloat].
/// \param __D
/// A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] that comes from the dot product of
/// __A, __B, and __D.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
(__v16bf)__A,
(__v16bf)__B);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \param __B
/// A 256-bit vector of [16 x bfloat].
/// \param __D
/// A 256-bit vector of [8 x float].
/// \param __U
/// A 16-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
/// \returns A 256-bit vector of [8 x float] that comes from the dot product of
/// __A, __B, and __D.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
(__v8sf)__D);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \param __B
/// A 256-bit vector of [16 x bfloat].
/// \param __D
/// A 256-bit vector of [8 x float].
/// \param __U
/// An 8-bit mask value specifying what is chosen for each element.
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
/// \returns A 256-bit vector of [8 x float] that comes from the dot product of
/// __A, __B, and __D.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
(__v8sf)_mm256_setzero_si256());
}
/// Convert One Single float Data to One BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
/// A float value.
/// \returns A bf16 value whose sign and exponent fields are unchanged, and
/// whose fraction field is truncated to 7 bits.
static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
__v4sf __V = {__A, 0, 0, 0};
__v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
(__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
return (__bf16)__R[0];
}
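/* Editor's illustration, not part of the vendored header: a bf16 value is
 * roughly the top 16 bits of the float's IEEE-754 encoding. A truncating
 * bit-level model of the scalar conversion above (the VCVTNEPS2BF16
 * hardware instruction additionally rounds the discarded fraction bits): */
static inline unsigned short bf16_bits_model(float __f) {
  union { float f; unsigned int u; } __v = {__f};
  return (unsigned short)(__v.u >> 16); /* sign, exponent, 7 fraction bits */
}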
/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] that comes from conversion of __A.
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
return _mm_castsi128_ps(
(__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] that comes from conversion of __A.
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
(__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
/// A 4-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] that comes from conversion of __A.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
(__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
/// An 8-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] that comes from conversion of __A.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
(__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
/// A 128-bit vector of [4 x float]. Elements are copied from __S when
/// the corresponding mask bit is not set.
/// \param __U
/// A 4-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] that comes from conversion of __A.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
(__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
16));
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
/// A 256-bit vector of [8 x float]. Elements are copied from __S when
/// the corresponding mask bit is not set.
/// \param __U
/// An 8-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] that comes from conversion of __A.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
(__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
16));
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif
#endif
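
A usage sketch (editor's illustration, assuming AVX512BF16+VL): an 8-element dot product computed through the bf16 pairs, then reduced horizontally. Results are approximate, since each input is first narrowed to bf16 precision.

#include <immintrin.h>

static inline float dot8_bf16(const float *x, const float *y) {
  // Pack 8 floats into [8 x bfloat]: the lower half comes from the second operand.
  __m128bh a = _mm_cvtne2ps_pbh(_mm_loadu_ps(x + 4), _mm_loadu_ps(x));
  __m128bh b = _mm_cvtne2ps_pbh(_mm_loadu_ps(y + 4), _mm_loadu_ps(y));
  __m128 acc = _mm_dpbf16_ps(_mm_setzero_ps(), a, b);  // 4 partial sums
  acc = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));      // lanes 2,3 -> 0,1
  acc = _mm_add_ss(acc, _mm_shuffle_ps(acc, acc, 1));  // lane 1 -> 0
  return _mm_cvtss_f32(acc);
}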

151 third_party/intel/clang/avx512vlbitalgintrin.h vendored Normal file

@@ -0,0 +1,151 @@
/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLBITALGINTRIN_H
#define __AVX512VLBITALGINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512bitalg,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512bitalg,no-evex512"), \
__min_vector_width__(256)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi16(__m256i __A)
{
return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
{
return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
(__v16hi) _mm256_popcnt_epi16(__B),
(__v16hi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
{
return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
__U,
__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi16(__m128i __A)
{
return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
{
return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
(__v8hi) _mm_popcnt_epi16(__B),
(__v8hi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
{
return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
__U,
__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi8(__m256i __A)
{
return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
{
return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
(__v32qi) _mm256_popcnt_epi8(__B),
(__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
{
return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
__U,
__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi8(__m128i __A)
{
return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
{
return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
(__v16qi) _mm_popcnt_epi8(__B),
(__v16qi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
{
return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
__U,
__B);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B)
{
return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
(__v32qi) __B,
__U);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B)
{
return _mm256_mask_bitshuffle_epi64_mask((__mmask32) -1,
__A,
__B);
}
static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B)
{
return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
(__v16qi) __B,
__U);
}
static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B)
{
return _mm_mask_bitshuffle_epi64_mask((__mmask16) -1,
__A,
__B);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif
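
A usage sketch (editor's illustration, assuming AVX512BITALG+VL): the total population count of 16 bytes via the per-byte popcount plus an SSE2 byte reduction.

#include <immintrin.h>

static inline int popcount16(const void *p) {
  __m128i v = _mm_loadu_si128((const __m128i *)p);
  __m128i c = _mm_popcnt_epi8(v);                    // set bits per byte
  __m128i s = _mm_sad_epu8(c, _mm_setzero_si128());  // sum bytes per qword
  return _mm_cvtsi128_si32(s) + _mm_extract_epi16(s, 4);
}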

3167 third_party/intel/clang/avx512vlbwintrin.h vendored Normal file

File diff suppressed because it is too large

230 third_party/intel/clang/avx512vlcdintrin.h vendored Normal file

@@ -0,0 +1,230 @@
/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLCDINTRIN_H
#define __AVX512VLCDINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512cd,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512cd,no-evex512"), \
__min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastmb_epi64 (__mmask8 __A)
{
return (__m128i) _mm_set1_epi64x((long long) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastmb_epi64 (__mmask8 __A)
{
return (__m256i) _mm256_set1_epi64x((long long)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastmw_epi32 (__mmask16 __A)
{
return (__m128i) _mm_set1_epi32((int)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastmw_epi32 (__mmask16 __A)
{
return (__m256i) _mm256_set1_epi32((int)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_conflict_epi64 (__m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictdi_128 ((__v2di) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_conflict_epi64(__A),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_conflict_epi64(__A),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_conflict_epi64 (__m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictdi_256 ((__v4di) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_conflict_epi64(__A),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_conflict_epi64(__A),
(__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_conflict_epi32 (__m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictsi_128 ((__v4si) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_conflict_epi32(__A),
(__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_conflict_epi32(__A),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_conflict_epi32 (__m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictsi_256 ((__v8si) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_conflict_epi32(__A),
(__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_conflict_epi32(__A),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_lzcnt_epi32 (__m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntd_128 ((__v4si) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_lzcnt_epi32(__A),
(__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_lzcnt_epi32(__A),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_lzcnt_epi32 (__m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntd_256 ((__v8si) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_lzcnt_epi32(__A),
(__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_lzcnt_epi32(__A),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_lzcnt_epi64 (__m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntq_128 ((__v2di) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_lzcnt_epi64(__A),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
{
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_lzcnt_epi64(__A),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_lzcnt_epi64 (__m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntq_256 ((__v4di) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_lzcnt_epi64(__A),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
{
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_lzcnt_epi64(__A),
(__v4di)_mm256_setzero_si256());
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif /* __AVX512VLCDINTRIN_H */
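
A usage sketch (editor's illustration, assuming AVX512CD+VL): the per-lane leading-zero count gives floor(log2(x)) for nonzero 32-bit lanes as 31 - lzcnt(x); the conflict intrinsics serve a similar role when vectorizing histogram-style scatters.

#include <immintrin.h>

static inline __m128i log2_floor_epi32(__m128i x) {  // lanes must be nonzero
  return _mm_sub_epi32(_mm_set1_epi32(31), _mm_lzcnt_epi32(x));
}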

1173 third_party/intel/clang/avx512vldqintrin.h vendored Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

8437 third_party/intel/clang/avx512vlintrin.h vendored Normal file

File diff suppressed because it is too large

695 third_party/intel/clang/avx512vlvbmi2intrin.h vendored Normal file

@@ -0,0 +1,695 @@
/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLVBMI2INTRIN_H
#define __AVX512VLVBMI2INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512vbmi2,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512vbmi2,no-evex512"), \
__min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
(__v8hi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
(__v16qi) _mm_setzero_si128(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
{
__builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
{
__builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
(__v8hi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
(__v16qi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
(__v8hi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
(__v16qi) _mm_setzero_si128(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
(__v16hi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
(__v16hi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
(__v32qi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
(__v32qi) _mm256_setzero_si256(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D)
{
__builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D)
{
__builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
(__v16hi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
(__v16hi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
(__v32qi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
(__v32qi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
(__v16hi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
(__v16hi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
(__v32qi) __S,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
(__v32qi) _mm256_setzero_si256(),
__U);
}
#define _mm256_shldi_epi64(A, B, I) \
((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I)))
#define _mm256_mask_shldi_epi64(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shldi_epi64((A), (B), (I)), \
(__v4di)(__m256i)(S)))
#define _mm256_maskz_shldi_epi64(U, A, B, I) \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shldi_epi64((A), (B), (I)), \
(__v4di)_mm256_setzero_si256()))
#define _mm_shldi_epi64(A, B, I) \
((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I)))
#define _mm_mask_shldi_epi64(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shldi_epi64((A), (B), (I)), \
(__v2di)(__m128i)(S)))
#define _mm_maskz_shldi_epi64(U, A, B, I) \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shldi_epi64((A), (B), (I)), \
(__v2di)_mm_setzero_si128()))
#define _mm256_shldi_epi32(A, B, I) \
((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I)))
#define _mm256_mask_shldi_epi32(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shldi_epi32((A), (B), (I)), \
(__v8si)(__m256i)(S)))
#define _mm256_maskz_shldi_epi32(U, A, B, I) \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shldi_epi32((A), (B), (I)), \
(__v8si)_mm256_setzero_si256()))
#define _mm_shldi_epi32(A, B, I) \
((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I)))
#define _mm_mask_shldi_epi32(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shldi_epi32((A), (B), (I)), \
(__v4si)(__m128i)(S)))
#define _mm_maskz_shldi_epi32(U, A, B, I) \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shldi_epi32((A), (B), (I)), \
(__v4si)_mm_setzero_si128()))
#define _mm256_shldi_epi16(A, B, I) \
((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I)))
#define _mm256_mask_shldi_epi16(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
(__v16hi)(__m256i)(S)))
#define _mm256_maskz_shldi_epi16(U, A, B, I) \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
(__v16hi)_mm256_setzero_si256()))
#define _mm_shldi_epi16(A, B, I) \
((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I)))
#define _mm_mask_shldi_epi16(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shldi_epi16((A), (B), (I)), \
(__v8hi)(__m128i)(S)))
#define _mm_maskz_shldi_epi16(U, A, B, I) \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shldi_epi16((A), (B), (I)), \
(__v8hi)_mm_setzero_si128()))
#define _mm256_shrdi_epi64(A, B, I) \
((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I)))
#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
(__v4di)(__m256i)(S)))
#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
(__v4di)_mm256_setzero_si256()))
#define _mm_shrdi_epi64(A, B, I) \
((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I)))
#define _mm_mask_shrdi_epi64(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shrdi_epi64((A), (B), (I)), \
(__v2di)(__m128i)(S)))
#define _mm_maskz_shrdi_epi64(U, A, B, I) \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shrdi_epi64((A), (B), (I)), \
(__v2di)_mm_setzero_si128()))
#define _mm256_shrdi_epi32(A, B, I) \
((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I)))
#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
(__v8si)(__m256i)(S)))
#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
(__v8si)_mm256_setzero_si256()))
#define _mm_shrdi_epi32(A, B, I) \
((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I)))
#define _mm_mask_shrdi_epi32(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shrdi_epi32((A), (B), (I)), \
(__v4si)(__m128i)(S)))
#define _mm_maskz_shrdi_epi32(U, A, B, I) \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shrdi_epi32((A), (B), (I)), \
(__v4si)_mm_setzero_si128()))
#define _mm256_shrdi_epi16(A, B, I) \
((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I)))
#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
(__v16hi)(__m256i)(S)))
#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
(__v16hi)_mm256_setzero_si256()))
#define _mm_shrdi_epi16(A, B, I) \
((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I)))
#define _mm_mask_shrdi_epi16(S, U, A, B, I) \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
(__v8hi)(__m128i)(S)))
#define _mm_maskz_shrdi_epi16(U, A, B, I) \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
(__v8hi)_mm_setzero_si128()))
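/* Editor's illustration, not part of the vendored header: scalar models of
 * the concatenate-and-shift ("funnel shift") immediates above, for __i in
 * 1..63. shldi keeps the high half of (A:B) << I; shrdi keeps the low half
 * of (B:A) >> I. */
static inline unsigned long long shld64_model(unsigned long long __a,
                                              unsigned long long __b,
                                              int __i) {
  return (__a << __i) | (__b >> (64 - __i));
}
static inline unsigned long long shrd64_model(unsigned long long __a,
                                              unsigned long long __b,
                                              int __i) {
  return (__a >> __i) | (__b << (64 - __i));
}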
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshldvq256((__v4di)__A, (__v4di)__B,
(__v4di)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shldv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectq_256(__U,
(__v4di)_mm256_shldv_epi64(__A, __B, __C),
(__v4di)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectq_256(__U,
(__v4di)_mm256_shldv_epi64(__A, __B, __C),
(__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshldvq128((__v2di)__A, (__v2di)__B,
(__v2di)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shldv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectq_128(__U,
(__v2di)_mm_shldv_epi64(__A, __B, __C),
(__v2di)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectq_128(__U,
(__v2di)_mm_shldv_epi64(__A, __B, __C),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshldvd256((__v8si)__A, (__v8si)__B,
(__v8si)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shldv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_shldv_epi32(__A, __B, __C),
(__v8si)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_shldv_epi32(__A, __B, __C),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshldvd128((__v4si)__A, (__v4si)__B,
(__v4si)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shldv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_shldv_epi32(__A, __B, __C),
(__v4si)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_shldv_epi32(__A, __B, __C),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshldvw256((__v16hi)__A, (__v16hi)__B,
(__v16hi)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shldv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectw_256(__U,
(__v16hi)_mm256_shldv_epi16(__A, __B, __C),
(__v16hi)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectw_256(__U,
(__v16hi)_mm256_shldv_epi16(__A, __B, __C),
(__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshldvw128((__v8hi)__A, (__v8hi)__B,
(__v8hi)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shldv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectw_128(__U,
(__v8hi)_mm_shldv_epi16(__A, __B, __C),
(__v8hi)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectw_128(__U,
(__v8hi)_mm_shldv_epi16(__A, __B, __C),
(__v8hi)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshrdvq256((__v4di)__A, (__v4di)__B,
(__v4di)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectq_256(__U,
(__v4di)_mm256_shrdv_epi64(__A, __B, __C),
(__v4di)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectq_256(__U,
(__v4di)_mm256_shrdv_epi64(__A, __B, __C),
(__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshrdvq128((__v2di)__A, (__v2di)__B,
(__v2di)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shrdv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectq_128(__U,
(__v2di)_mm_shrdv_epi64(__A, __B, __C),
(__v2di)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectq_128(__U,
(__v2di)_mm_shrdv_epi64(__A, __B, __C),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshrdvd256((__v8si)__A, (__v8si)__B,
(__v8si)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_shrdv_epi32(__A, __B, __C),
(__v8si)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_shrdv_epi32(__A, __B, __C),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshrdvd128((__v4si)__A, (__v4si)__B,
(__v4si)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shrdv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_shrdv_epi32(__A, __B, __C),
(__v4si)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_shrdv_epi32(__A, __B, __C),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_vpshrdvw256((__v16hi)__A, (__v16hi)__B,
(__v16hi)__C);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectw_256(__U,
(__v16hi)_mm256_shrdv_epi16(__A, __B, __C),
(__v16hi)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)__builtin_ia32_selectw_256(__U,
(__v16hi)_mm256_shrdv_epi16(__A, __B, __C),
(__v16hi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_vpshrdvw128((__v8hi)__A, (__v8hi)__B,
(__v8hi)__C);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shrdv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectw_128(__U,
(__v8hi)_mm_shrdv_epi16(__A, __B, __C),
(__v8hi)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
return (__m128i)__builtin_ia32_selectw_128(__U,
(__v8hi)_mm_shrdv_epi16(__A, __B, __C),
(__v8hi)_mm_setzero_si128());
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif
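
A usage sketch (editor's illustration, assuming AVX512VBMI2 plus AVX512BW/VL for the byte-test mask): left-packing the nonzero bytes of a vector with the compress intrinsics, zero-filling the tail.

#include <immintrin.h>

static inline __m128i pack_nonzero_bytes(__m128i v) {
  __mmask16 m = _mm_test_epi8_mask(v, v);  // bit set where a byte is nonzero
  return _mm_maskz_compress_epi8(m, v);
}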

310 third_party/intel/clang/avx512vlvnniintrin.h vendored Normal file

@@ -0,0 +1,310 @@
/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VLVNNIINTRIN_H
#define __AVX512VLVNNIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512vnni,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512vnni,no-evex512"), \
__min_vector_width__(256)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusd_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
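/* Editor's illustration, not part of the vendored header: a scalar model of
 * one 32-bit lane of VPDPBUSD, matching the pseudo-code above (each u8 x s8
 * product fits in a signed 16-bit intermediate). */
static inline int dpbusd_lane_model(int __s, const unsigned char __a[4],
                                    const signed char __b[4]) {
  for (int __k = 0; __k < 4; ++__k)
    __s += (int)__a[__k] * (int)__b[__k];
  return __s;
}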
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusds_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssd_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssds_epi32(S, A, B) \
((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusd_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusds_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssd_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssds_epi32(S, A, B) \
((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusd_epi32(__S, __A, __B),
(__v4si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusd_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusds_epi32(__S, __A, __B),
(__v4si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusds_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssd_epi32(__S, __A, __B),
(__v4si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssd_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssds_epi32(__S, __A, __B),
(__v4si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssds_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif
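
Usage sketch (illustrative, not part of this header): every VNNI macro above computes DST = S + dot(A, B) per 32-bit lane, with the *s forms saturating the final sum. A minimal example of the 256-bit unsigned-by-signed byte form, checked against a scalar reference loop; the buffer names are hypothetical, and it assumes an AVX512VL+AVX512VNNI CPU and compilation with -mavx512vl -mavx512vnni.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint8_t a[32];            /* unsigned bytes (the A operand) */
  int8_t b[32];             /* signed bytes (the B operand) */
  int32_t s[8], ref[8], out[8];
  for (int i = 0; i < 32; ++i) {
    a[i] = (uint8_t)(i + 1);
    b[i] = (int8_t)(i - 16);
  }
  for (int j = 0; j < 8; ++j)
    s[j] = 1000 * j;
  /* scalar reference: each dword j sums four u8*s8 products onto s[j] */
  for (int j = 0; j < 8; ++j) {
    int32_t acc = s[j];
    for (int k = 0; k < 4; ++k)
      acc += (int32_t)a[4 * j + k] * (int32_t)b[4 * j + k];
    ref[j] = acc;
  }
  __m256i vd = _mm256_dpbusd_epi32(
      _mm256_loadu_si256((const __m256i *)s),
      _mm256_loadu_si256((const __m256i *)a),
      _mm256_loadu_si256((const __m256i *)b));
  _mm256_storeu_si256((__m256i *)out, vd);
  for (int j = 0; j < 8; ++j)
    printf("dword %d: %d (%s)\n", j, out[j],
           out[j] == ref[j] ? "ok" : "MISMATCH");
  return 0;
}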

View file

@@ -0,0 +1,123 @@
/*===------ avx512vlvp2intersectintrin.h - VL VP2INTERSECT intrinsics ------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvp2intersectintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512VLVP2INTERSECT_H
#define _AVX512VLVP2INTERSECT_H
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512vp2intersect,no-evex512"), \
__min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512vp2intersect,no-evex512"), \
__min_vector_width__(256)))
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between dwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
///
/// \param __a
/// A 256-bit vector of [8 x i32].
/// \param __b
/// A 256-bit vector of [8 x i32].
/// \param __m0
/// A pointer to an 8-bit mask receiving the match indicators for __a.
/// \param __m1
/// A pointer to an 8-bit mask receiving the match indicators for __b.
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_2intersect_epi32(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_d_256((__v8si)__a, (__v8si)__b, __m0, __m1);
}
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between quadwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
///
/// \param __a
/// A 256-bit vector of [4 x i64].
/// \param __b
/// A 256-bit vector of [4 x i64].
/// \param __m0
/// A pointer to an 8-bit mask receiving the match indicators for __a.
/// \param __m1
/// A pointer to an 8-bit mask receiving the match indicators for __b.
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_2intersect_epi64(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_q_256((__v4di)__a, (__v4di)__b, __m0, __m1);
}
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between dwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x i32].
/// \param __b
/// A 128-bit vector of [4 x i32].
/// \param __m0
/// A pointer to an 8-bit mask receiving the match indicators for __a.
/// \param __m1
/// A pointer to an 8-bit mask receiving the match indicators for __b.
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_2intersect_epi32(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_d_128((__v4si)__a, (__v4si)__b, __m0, __m1);
}
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between quadwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x i64].
/// \param __b
/// A 128-bit vector of [2 x i64].
/// \param __m0
/// A pointer to an 8-bit mask receiving the match indicators for __a.
/// \param __m1
/// A pointer to an 8-bit mask receiving the match indicators for __b.
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_2intersect_epi64(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
__builtin_ia32_vp2intersect_q_128((__v2di)__a, (__v2di)__b, __m0, __m1);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif
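
Usage sketch (illustrative, not part of this header): per Intel's documented VP2INTERSECT semantics, the first output mask flags each element of __a that matches some element of __b, and the second flags each element of __b that matches some element of __a. A hypothetical example for the 128-bit dword form, assuming an AVX512VP2INTERSECT+AVX512VL CPU and compilation with -mavx512vp2intersect -mavx512vl.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  /* 5 and 9 occur in both vectors, at different positions */
  __m128i a = _mm_setr_epi32(5, 2, 9, 4);
  __m128i b = _mm_setr_epi32(9, 5, 7, 8);
  __mmask8 ma, mb;
  _mm_2intersect_epi32(a, b, &ma, &mb);
  /* only the low 4 bits of each mask are meaningful here;
     expected: ma = 0x5 (a[0], a[2] match), mb = 0x3 (b[0], b[1] match) */
  printf("ma=%#x mb=%#x\n", ma, mb);
  return 0;
}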

View file

@@ -0,0 +1,116 @@
/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------===
*
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VNNIINTRIN_H
#define __AVX512VNNIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vnni,evex512"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
#undef __DEFAULT_FN_ATTRS
#endif
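
Usage sketch (illustrative, not part of this header): the _mm512_mask_* wrappers above blend the dot-product result with __S under the mask __U, while the _mm512_maskz_* forms zero the unselected dwords. A hypothetical example of the masked 16-bit form, assuming an AVX512F+AVX512VNNI CPU and compilation with -mavx512vnni.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  int16_t a[32], b[32];
  int32_t s[16], out[16];
  for (int i = 0; i < 32; ++i) {
    a[i] = (int16_t)i;
    b[i] = (int16_t)(i + 1);
  }
  for (int j = 0; j < 16; ++j)
    s[j] = 100;
  __m512i va = _mm512_loadu_si512(a);
  __m512i vb = _mm512_loadu_si512(b);
  __m512i vs = _mm512_loadu_si512(s);
  /* 0x5555 selects the even dwords: each gets
     s + a[2j]*b[2j] + a[2j+1]*b[2j+1]; odd dwords pass through from vs */
  __m512i vd = _mm512_mask_dpwssd_epi32(vs, (__mmask16)0x5555, va, vb);
  _mm512_storeu_si512(out, vd);
  for (int j = 0; j < 16; ++j)
    printf("%d%c", out[j], j == 15 ? '\n' : ' ');
  return 0;
}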

Some files were not shown because too many files have changed in this diff.