mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-08-03 16:30:29 +00:00
Merge branch 'master' into ctl-unique-test
This commit is contained in:
commit
5757eaf70d
198 changed files with 199788 additions and 647 deletions
4
.vscode/settings.json
vendored
4
.vscode/settings.json
vendored
|
@ -1,5 +1,5 @@
|
||||||
{
|
{
|
||||||
"C_Cpp.default.compilerPath": ".cosmocc/3.7.1/bin/aarch64-linux-cosmo-c++",
|
"C_Cpp.default.compilerPath": ".cosmocc/3.8.0/bin/aarch64-linux-cosmo-c++",
|
||||||
"C_Cpp.default.compilerArgs": [
|
"C_Cpp.default.compilerArgs": [
|
||||||
"-nostdinc",
|
"-nostdinc",
|
||||||
"-nostdlib",
|
"-nostdlib",
|
||||||
|
@ -33,4 +33,4 @@
|
||||||
"files.associations": {
|
"files.associations": {
|
||||||
"log.h": "c"
|
"log.h": "c"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
4
Makefile
4
Makefile
|
@ -147,10 +147,10 @@ export MODE
|
||||||
export SOURCE_DATE_EPOCH
|
export SOURCE_DATE_EPOCH
|
||||||
export TMPDIR
|
export TMPDIR
|
||||||
|
|
||||||
COSMOCC = .cosmocc/3.7.1
|
COSMOCC = .cosmocc/3.8.0
|
||||||
BOOTSTRAP = $(COSMOCC)/bin
|
BOOTSTRAP = $(COSMOCC)/bin
|
||||||
TOOLCHAIN = $(COSMOCC)/bin/$(ARCH)-linux-cosmo-
|
TOOLCHAIN = $(COSMOCC)/bin/$(ARCH)-linux-cosmo-
|
||||||
DOWNLOAD := $(shell build/download-cosmocc.sh $(COSMOCC) 3.7.1 13b65b0e659b493bd82f3d0a319d0265d66f849839e484aa2a54191024711e85)
|
DOWNLOAD := $(shell build/download-cosmocc.sh $(COSMOCC) 3.8.0 813c6b2f95062d2e0a845307a79505424cb98cb038e8013334f8a22e3b92a474)
|
||||||
|
|
||||||
IGNORE := $(shell $(MKDIR) $(TMPDIR))
|
IGNORE := $(shell $(MKDIR) $(TMPDIR))
|
||||||
|
|
||||||
|
|
|
@ -103,10 +103,8 @@ SECTIONS {
|
||||||
*(.eh_frame_entry .eh_frame_entry.*)
|
*(.eh_frame_entry .eh_frame_entry.*)
|
||||||
}
|
}
|
||||||
|
|
||||||
.eh_frame : ONLY_IF_RO {
|
__eh_frame_hdr_start = SIZEOF(.eh_frame_hdr) > 0 ? ADDR(.eh_frame_hdr) : 0;
|
||||||
KEEP(*(.eh_frame))
|
__eh_frame_hdr_end = SIZEOF(.eh_frame_hdr) > 0 ? . : 0;
|
||||||
*(.eh_frame.*)
|
|
||||||
}
|
|
||||||
|
|
||||||
.gcc_except_table : ONLY_IF_RO {
|
.gcc_except_table : ONLY_IF_RO {
|
||||||
*(.gcc_except_table .gcc_except_table.*)
|
*(.gcc_except_table .gcc_except_table.*)
|
||||||
|
@ -127,9 +125,11 @@ SECTIONS {
|
||||||
. += CONSTANT(MAXPAGESIZE);
|
. += CONSTANT(MAXPAGESIZE);
|
||||||
. = DATA_SEGMENT_ALIGN(CONSTANT(MAXPAGESIZE), CONSTANT(COMMONPAGESIZE));
|
. = DATA_SEGMENT_ALIGN(CONSTANT(MAXPAGESIZE), CONSTANT(COMMONPAGESIZE));
|
||||||
|
|
||||||
.eh_frame : ONLY_IF_RW {
|
.eh_frame : {
|
||||||
|
__eh_frame_start = .;
|
||||||
KEEP(*(.eh_frame))
|
KEEP(*(.eh_frame))
|
||||||
*(.eh_frame.*)
|
*(.eh_frame.*)
|
||||||
|
__eh_frame_end = .;
|
||||||
}
|
}
|
||||||
|
|
||||||
.gnu_extab : ONLY_IF_RW {
|
.gnu_extab : ONLY_IF_RW {
|
||||||
|
|
12
ape/ape.lds
12
ape/ape.lds
|
@ -329,6 +329,10 @@ SECTIONS {
|
||||||
*(.ubsan.types)
|
*(.ubsan.types)
|
||||||
*(.ubsan.data)
|
*(.ubsan.data)
|
||||||
|
|
||||||
|
__eh_frame_hdr_start_actual = .;
|
||||||
|
*(.eh_frame_hdr)
|
||||||
|
__eh_frame_hdr_end_actual = .;
|
||||||
|
|
||||||
/* Legal Notices */
|
/* Legal Notices */
|
||||||
__notices = .;
|
__notices = .;
|
||||||
KEEP(*(.notice))
|
KEEP(*(.notice))
|
||||||
|
@ -422,6 +426,11 @@ SECTIONS {
|
||||||
KEEP(*(.dtors))
|
KEEP(*(.dtors))
|
||||||
__fini_array_end = .;
|
__fini_array_end = .;
|
||||||
|
|
||||||
|
__eh_frame_start = .;
|
||||||
|
KEEP(*(.eh_frame))
|
||||||
|
*(.eh_frame.*)
|
||||||
|
__eh_frame_end = .;
|
||||||
|
|
||||||
/*BEGIN: Post-Initialization Read-Only */
|
/*BEGIN: Post-Initialization Read-Only */
|
||||||
. = ALIGN(. != 0 ? __SIZEOF_POINTER__ : 0);
|
. = ALIGN(. != 0 ? __SIZEOF_POINTER__ : 0);
|
||||||
KEEP(*(SORT_BY_NAME(.piro.relo.sort.*)))
|
KEEP(*(SORT_BY_NAME(.piro.relo.sort.*)))
|
||||||
|
@ -601,6 +610,9 @@ ape_text_memsz = ape_text_filesz;
|
||||||
ape_text_align = CONSTANT(COMMONPAGESIZE);
|
ape_text_align = CONSTANT(COMMONPAGESIZE);
|
||||||
ape_text_rva = RVA(ape_text_vaddr);
|
ape_text_rva = RVA(ape_text_vaddr);
|
||||||
|
|
||||||
|
__eh_frame_hdr_start = __eh_frame_hdr_end_actual > __eh_frame_hdr_start_actual ? __eh_frame_hdr_start_actual : 0;
|
||||||
|
__eh_frame_hdr_end = __eh_frame_hdr_end_actual > __eh_frame_hdr_start_actual ? __eh_frame_hdr_end_actual : 0;
|
||||||
|
|
||||||
/* we roundup here because xnu wants the file load segments page-aligned */
|
/* we roundup here because xnu wants the file load segments page-aligned */
|
||||||
/* but we don't want to add the nop padding to the ape program, so we'll */
|
/* but we don't want to add the nop padding to the ape program, so we'll */
|
||||||
/* let ape.S dd read past the end of the file into the wrapping binaries */
|
/* let ape.S dd read past the end of the file into the wrapping binaries */
|
||||||
|
|
|
@ -92,10 +92,7 @@ DEFAULT_COPTS ?= \
|
||||||
-fno-gnu-unique \
|
-fno-gnu-unique \
|
||||||
-fstrict-aliasing \
|
-fstrict-aliasing \
|
||||||
-fstrict-overflow \
|
-fstrict-overflow \
|
||||||
-fno-semantic-interposition \
|
-fno-semantic-interposition
|
||||||
-fno-dwarf2-cfi-asm \
|
|
||||||
-fno-unwind-tables \
|
|
||||||
-fno-asynchronous-unwind-tables
|
|
||||||
|
|
||||||
ifeq ($(ARCH), x86_64)
|
ifeq ($(ARCH), x86_64)
|
||||||
# Microsoft says "[a]ny memory below the stack beyond the red zone
|
# Microsoft says "[a]ny memory below the stack beyond the red zone
|
||||||
|
@ -139,8 +136,6 @@ DEFAULT_CFLAGS = \
|
||||||
|
|
||||||
DEFAULT_CXXFLAGS = \
|
DEFAULT_CXXFLAGS = \
|
||||||
-std=gnu++23 \
|
-std=gnu++23 \
|
||||||
-fno-rtti \
|
|
||||||
-fno-exceptions \
|
|
||||||
-fuse-cxa-atexit \
|
-fuse-cxa-atexit \
|
||||||
-Wno-int-in-bool-context \
|
-Wno-int-in-bool-context \
|
||||||
-Wno-narrowing \
|
-Wno-narrowing \
|
||||||
|
|
|
@ -6,14 +6,14 @@ if [ -n "$OBJDUMP" ]; then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
find_objdump() {
|
find_objdump() {
|
||||||
if [ -x .cosmocc/3.6.0/bin/$1-linux-cosmo-objdump ]; then
|
if [ -x .cosmocc/3.8.0/bin/$1-linux-cosmo-objdump ]; then
|
||||||
OBJDUMP=.cosmocc/3.6.0/bin/$1-linux-cosmo-objdump
|
OBJDUMP=.cosmocc/3.8.0/bin/$1-linux-cosmo-objdump
|
||||||
elif [ -x .cosmocc/3.6.0/bin/$1-linux-musl-objdump ]; then
|
elif [ -x .cosmocc/3.8.0/bin/$1-linux-musl-objdump ]; then
|
||||||
OBJDUMP=.cosmocc/3.6.0/bin/$1-linux-musl-objdump
|
OBJDUMP=.cosmocc/3.8.0/bin/$1-linux-musl-objdump
|
||||||
elif [ -x "$COSMO/.cosmocc/3.6.0/bin/$1-linux-cosmo-objdump" ]; then
|
elif [ -x "$COSMO/.cosmocc/3.8.0/bin/$1-linux-cosmo-objdump" ]; then
|
||||||
OBJDUMP="$COSMO/.cosmocc/3.6.0/bin/$1-linux-cosmo-objdump"
|
OBJDUMP="$COSMO/.cosmocc/3.8.0/bin/$1-linux-cosmo-objdump"
|
||||||
elif [ -x "$COSMO/.cosmocc/3.6.0/bin/$1-linux-musl-objdump" ]; then
|
elif [ -x "$COSMO/.cosmocc/3.8.0/bin/$1-linux-musl-objdump" ]; then
|
||||||
OBJDUMP="$COSMO/.cosmocc/3.6.0/bin/$1-linux-musl-objdump"
|
OBJDUMP="$COSMO/.cosmocc/3.8.0/bin/$1-linux-musl-objdump"
|
||||||
else
|
else
|
||||||
echo "error: toolchain not found (try running 'cosmocc --update' or 'make' in the cosmo monorepo)" >&2
|
echo "error: toolchain not found (try running 'cosmocc --update' or 'make' in the cosmo monorepo)" >&2
|
||||||
exit 1
|
exit 1
|
||||||
|
|
|
@ -17,6 +17,9 @@ struct conditional<false, T, F>
|
||||||
typedef F type;
|
typedef F type;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<bool B, typename T, typename F>
|
||||||
|
using conditional_t = typename conditional<B, T, F>::type;
|
||||||
|
|
||||||
} // namespace ctl
|
} // namespace ctl
|
||||||
|
|
||||||
#endif // CTL_CONDITIONAL_H_
|
#endif // CTL_CONDITIONAL_H_
|
||||||
|
|
|
@ -19,6 +19,9 @@ template<typename _Tp>
|
||||||
struct is_void : public is_void_<typename ctl::remove_cv<_Tp>::type>::type
|
struct is_void : public is_void_<typename ctl::remove_cv<_Tp>::type>::type
|
||||||
{};
|
{};
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
inline constexpr bool is_void_v = is_void<T>::value;
|
||||||
|
|
||||||
} // namespace ctl
|
} // namespace ctl
|
||||||
|
|
||||||
#endif // CTL_IS_VOID_H_
|
#endif // CTL_IS_VOID_H_
|
||||||
|
|
15
ctl/set.h
15
ctl/set.h
|
@ -241,8 +241,9 @@ class set
|
||||||
private:
|
private:
|
||||||
friend class set;
|
friend class set;
|
||||||
node_type* node_;
|
node_type* node_;
|
||||||
|
node_type* root_;
|
||||||
|
|
||||||
explicit reverse_iterator(node_type* node) : node_(node)
|
explicit reverse_iterator(node_type* node, node_type* root) : node_(node), root_(root)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -347,17 +348,17 @@ class set
|
||||||
|
|
||||||
reverse_iterator rbegin()
|
reverse_iterator rbegin()
|
||||||
{
|
{
|
||||||
return reverse_iterator(rightmost(root_));
|
return reverse_iterator(rightmost(root_), root_);
|
||||||
}
|
}
|
||||||
|
|
||||||
const_reverse_iterator rbegin() const
|
const_reverse_iterator rbegin() const
|
||||||
{
|
{
|
||||||
return const_reverse_iterator(rightmost(root_));
|
return const_reverse_iterator(rightmost(root_), root_);
|
||||||
}
|
}
|
||||||
|
|
||||||
const_reverse_iterator crbegin() const
|
const_reverse_iterator crbegin() const
|
||||||
{
|
{
|
||||||
return const_reverse_iterator(rightmost(root_));
|
return const_reverse_iterator(rightmost(root_), root_);
|
||||||
}
|
}
|
||||||
|
|
||||||
iterator end() noexcept
|
iterator end() noexcept
|
||||||
|
@ -377,17 +378,17 @@ class set
|
||||||
|
|
||||||
reverse_iterator rend()
|
reverse_iterator rend()
|
||||||
{
|
{
|
||||||
return reverse_iterator(nullptr);
|
return reverse_iterator(nullptr, root_);
|
||||||
}
|
}
|
||||||
|
|
||||||
const_reverse_iterator rend() const
|
const_reverse_iterator rend() const
|
||||||
{
|
{
|
||||||
return const_reverse_iterator(nullptr);
|
return const_reverse_iterator(nullptr, root_);
|
||||||
}
|
}
|
||||||
|
|
||||||
const_reverse_iterator crend() const
|
const_reverse_iterator crend() const
|
||||||
{
|
{
|
||||||
return const_reverse_iterator(nullptr);
|
return const_reverse_iterator(nullptr, root_);
|
||||||
}
|
}
|
||||||
|
|
||||||
void clear() noexcept
|
void clear() noexcept
|
||||||
|
|
454
ctl/shared_ptr.h
Normal file
454
ctl/shared_ptr.h
Normal file
|
@ -0,0 +1,454 @@
|
||||||
|
// -*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-
|
||||||
|
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
|
||||||
|
#ifndef CTL_SHARED_PTR_H_
|
||||||
|
#define CTL_SHARED_PTR_H_
|
||||||
|
|
||||||
|
#include "exception.h"
|
||||||
|
#include "is_convertible.h"
|
||||||
|
#include "remove_extent.h"
|
||||||
|
#include "unique_ptr.h"
|
||||||
|
|
||||||
|
// XXX currently needed to use placement-new syntax (move to cxx.inc?)
|
||||||
|
void*
|
||||||
|
operator new(size_t, void*) noexcept;
|
||||||
|
|
||||||
|
namespace ctl {
|
||||||
|
|
||||||
|
class bad_weak_ptr : public exception
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
const char* what() const noexcept override
|
||||||
|
{
|
||||||
|
return "ctl::bad_weak_ptr";
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace __ {
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
struct ptr_ref
|
||||||
|
{
|
||||||
|
using type = T&;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
struct ptr_ref<void>
|
||||||
|
{
|
||||||
|
using type = void;
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline __attribute__((always_inline)) void
|
||||||
|
incref(size_t* r) noexcept
|
||||||
|
{
|
||||||
|
#ifdef NDEBUG
|
||||||
|
__atomic_fetch_add(r, 1, __ATOMIC_RELAXED);
|
||||||
|
#else
|
||||||
|
size_t refs = __atomic_fetch_add(r, 1, __ATOMIC_RELAXED);
|
||||||
|
if (refs > ((size_t)-1) >> 1)
|
||||||
|
__builtin_trap();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __attribute__((always_inline)) bool
|
||||||
|
decref(size_t* r) noexcept
|
||||||
|
{
|
||||||
|
if (!__atomic_fetch_sub(r, 1, __ATOMIC_RELEASE)) {
|
||||||
|
__atomic_thread_fence(__ATOMIC_ACQUIRE);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
class shared_ref
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
constexpr shared_ref() noexcept = default;
|
||||||
|
shared_ref(const shared_ref&) = delete;
|
||||||
|
shared_ref& operator=(const shared_ref&) = delete;
|
||||||
|
|
||||||
|
virtual ~shared_ref() = default;
|
||||||
|
|
||||||
|
void keep_shared() noexcept
|
||||||
|
{
|
||||||
|
incref(&shared);
|
||||||
|
}
|
||||||
|
|
||||||
|
void drop_shared() noexcept
|
||||||
|
{
|
||||||
|
if (decref(&shared)) {
|
||||||
|
dispose();
|
||||||
|
drop_weak();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void keep_weak() noexcept
|
||||||
|
{
|
||||||
|
incref(&weak);
|
||||||
|
}
|
||||||
|
|
||||||
|
void drop_weak() noexcept
|
||||||
|
{
|
||||||
|
if (decref(&weak)) {
|
||||||
|
delete this;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t use_count() const noexcept
|
||||||
|
{
|
||||||
|
return shared + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t weak_count() const noexcept
|
||||||
|
{
|
||||||
|
return weak;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
virtual void dispose() noexcept = 0;
|
||||||
|
|
||||||
|
size_t shared = 0;
|
||||||
|
size_t weak = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T, typename D>
|
||||||
|
class shared_pointer : public shared_ref
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
static shared_pointer* make(T* const p, D d)
|
||||||
|
{
|
||||||
|
return make(unique_ptr<T, D>(p, move(d)));
|
||||||
|
}
|
||||||
|
|
||||||
|
static shared_pointer* make(unique_ptr<T, D> p)
|
||||||
|
{
|
||||||
|
return new shared_pointer(p.release(), move(p.get_deleter()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
shared_pointer(T* const p, D d) noexcept : p(p), d(move(d))
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
void dispose() noexcept override
|
||||||
|
{
|
||||||
|
move(d)(p);
|
||||||
|
}
|
||||||
|
|
||||||
|
T* const p;
|
||||||
|
[[no_unique_address]] D d;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
class shared_emplace : public shared_ref
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
union
|
||||||
|
{
|
||||||
|
T t;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename... Args>
|
||||||
|
void construct(Args&&... args)
|
||||||
|
{
|
||||||
|
::new (&t) T(forward<Args>(args)...);
|
||||||
|
}
|
||||||
|
|
||||||
|
static unique_ptr<shared_emplace> make()
|
||||||
|
{
|
||||||
|
return unique_ptr(new shared_emplace());
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
explicit constexpr shared_emplace() noexcept = default;
|
||||||
|
|
||||||
|
void dispose() noexcept override
|
||||||
|
{
|
||||||
|
t.~T();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T, typename U>
|
||||||
|
concept shared_ptr_compatible = is_convertible_v<U*, T*>;
|
||||||
|
|
||||||
|
} // namespace __
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
class weak_ptr;
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
class shared_ptr
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
using element_type = remove_extent_t<T>;
|
||||||
|
using weak_type = weak_ptr<T>;
|
||||||
|
|
||||||
|
constexpr shared_ptr() noexcept = default;
|
||||||
|
constexpr shared_ptr(nullptr_t) noexcept
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
explicit shared_ptr(U* const p) : shared_ptr(p, default_delete<U>())
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U, typename D>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
shared_ptr(U* const p, D d)
|
||||||
|
: p(p), rc(__::shared_pointer<U, D>::make(p, move(d)))
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
shared_ptr(const shared_ptr<U>& r, element_type* p) noexcept
|
||||||
|
: p(p), rc(r.rc)
|
||||||
|
{
|
||||||
|
if (rc)
|
||||||
|
rc->keep_shared();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
shared_ptr(shared_ptr<U>&& r, element_type* p) noexcept : p(p), rc(r.rc)
|
||||||
|
{
|
||||||
|
r.p = nullptr;
|
||||||
|
r.rc = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
shared_ptr(const shared_ptr<U>& r) noexcept : p(r.p), rc(r.rc)
|
||||||
|
{
|
||||||
|
if (rc)
|
||||||
|
rc->keep_shared();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
shared_ptr(shared_ptr<U>&& r) noexcept : p(r.p), rc(r.rc)
|
||||||
|
{
|
||||||
|
r.p = nullptr;
|
||||||
|
r.rc = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
shared_ptr(const shared_ptr& r) noexcept : p(r.p), rc(r.rc)
|
||||||
|
{
|
||||||
|
if (rc)
|
||||||
|
rc->keep_shared();
|
||||||
|
}
|
||||||
|
|
||||||
|
shared_ptr(shared_ptr&& r) noexcept : p(r.p), rc(r.rc)
|
||||||
|
{
|
||||||
|
r.p = nullptr;
|
||||||
|
r.rc = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
explicit shared_ptr(const weak_ptr<U>& r) : p(r.p), rc(r.rc)
|
||||||
|
{
|
||||||
|
if (r.expired()) {
|
||||||
|
throw bad_weak_ptr();
|
||||||
|
}
|
||||||
|
rc->keep_shared();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U, typename D>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
shared_ptr(unique_ptr<U, D>&& r)
|
||||||
|
: p(r.p), rc(__::shared_pointer<U, D>::make(move(r)))
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
~shared_ptr()
|
||||||
|
{
|
||||||
|
if (rc)
|
||||||
|
rc->drop_shared();
|
||||||
|
}
|
||||||
|
|
||||||
|
shared_ptr& operator=(shared_ptr r) noexcept
|
||||||
|
{
|
||||||
|
swap(r);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
shared_ptr& operator=(shared_ptr<U> r) noexcept
|
||||||
|
{
|
||||||
|
shared_ptr<T>(move(r)).swap(*this);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
void reset() noexcept
|
||||||
|
{
|
||||||
|
shared_ptr().swap(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
void reset(U* const p2)
|
||||||
|
{
|
||||||
|
shared_ptr<T>(p2).swap(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U, typename D>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
void reset(U* const p2, D d)
|
||||||
|
{
|
||||||
|
shared_ptr<T>(p2, d).swap(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
void swap(shared_ptr& r) noexcept
|
||||||
|
{
|
||||||
|
using ctl::swap;
|
||||||
|
swap(p, r.p);
|
||||||
|
swap(rc, r.rc);
|
||||||
|
}
|
||||||
|
|
||||||
|
element_type* get() const noexcept
|
||||||
|
{
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
typename __::ptr_ref<T>::type operator*() const noexcept
|
||||||
|
{
|
||||||
|
if (!p)
|
||||||
|
__builtin_trap();
|
||||||
|
return *p;
|
||||||
|
}
|
||||||
|
|
||||||
|
T* operator->() const noexcept
|
||||||
|
{
|
||||||
|
if (!p)
|
||||||
|
__builtin_trap();
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
long use_count() const noexcept
|
||||||
|
{
|
||||||
|
return rc ? rc->use_count() : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
explicit operator bool() const noexcept
|
||||||
|
{
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
bool owner_before(const shared_ptr<U>& r) const noexcept
|
||||||
|
{
|
||||||
|
return p < r.p;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
bool owner_before(const weak_ptr<U>& r) const noexcept
|
||||||
|
{
|
||||||
|
return !r.owner_before(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
template<typename U>
|
||||||
|
friend class weak_ptr;
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
friend class shared_ptr;
|
||||||
|
|
||||||
|
template<typename U, typename... Args>
|
||||||
|
friend shared_ptr<U> make_shared(Args&&... args);
|
||||||
|
|
||||||
|
element_type* p = nullptr;
|
||||||
|
__::shared_ref* rc = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
class weak_ptr
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
using element_type = remove_extent_t<T>;
|
||||||
|
|
||||||
|
constexpr weak_ptr() noexcept = default;
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
weak_ptr(const shared_ptr<U>& r) noexcept : p(r.p), rc(r.rc)
|
||||||
|
{
|
||||||
|
if (rc)
|
||||||
|
rc->keep_weak();
|
||||||
|
}
|
||||||
|
|
||||||
|
~weak_ptr()
|
||||||
|
{
|
||||||
|
if (rc)
|
||||||
|
rc->drop_weak();
|
||||||
|
}
|
||||||
|
|
||||||
|
long use_count() const noexcept
|
||||||
|
{
|
||||||
|
return rc ? rc->use_count() : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool expired() const noexcept
|
||||||
|
{
|
||||||
|
return !use_count();
|
||||||
|
}
|
||||||
|
|
||||||
|
void reset() noexcept
|
||||||
|
{
|
||||||
|
weak_ptr().swap(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
void swap(weak_ptr& r) noexcept
|
||||||
|
{
|
||||||
|
using ctl::swap;
|
||||||
|
swap(p, r.p);
|
||||||
|
swap(rc, r.rc);
|
||||||
|
}
|
||||||
|
|
||||||
|
shared_ptr<T> lock() const noexcept
|
||||||
|
{
|
||||||
|
if (expired())
|
||||||
|
return nullptr;
|
||||||
|
shared_ptr<T> r;
|
||||||
|
r.p = p;
|
||||||
|
r.rc = rc;
|
||||||
|
if (rc)
|
||||||
|
rc->keep_shared();
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
bool owner_before(const weak_ptr<U>& r) const noexcept
|
||||||
|
{
|
||||||
|
return p < r.p;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
bool owner_before(const shared_ptr<U>& r) const noexcept
|
||||||
|
{
|
||||||
|
return p < r.p;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
template<typename U>
|
||||||
|
friend class shared_ptr;
|
||||||
|
|
||||||
|
element_type* p = nullptr;
|
||||||
|
__::shared_ref* rc = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T, typename... Args>
|
||||||
|
shared_ptr<T>
|
||||||
|
make_shared(Args&&... args)
|
||||||
|
{
|
||||||
|
auto rc = __::shared_emplace<T>::make();
|
||||||
|
rc->construct(forward<Args>(args)...);
|
||||||
|
shared_ptr<T> r;
|
||||||
|
r.p = &rc->t;
|
||||||
|
r.rc = rc.release();
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace ctl
|
||||||
|
|
||||||
|
#endif // CTL_SHARED_PTR_H_
|
|
@ -94,6 +94,7 @@ EXAMPLES_DIRECTDEPS = \
|
||||||
THIRD_PARTY_VQSORT \
|
THIRD_PARTY_VQSORT \
|
||||||
THIRD_PARTY_XED \
|
THIRD_PARTY_XED \
|
||||||
THIRD_PARTY_LIBCXXABI \
|
THIRD_PARTY_LIBCXXABI \
|
||||||
|
THIRD_PARTY_LIBUNWIND \
|
||||||
THIRD_PARTY_ZLIB \
|
THIRD_PARTY_ZLIB \
|
||||||
TOOL_ARGS \
|
TOOL_ARGS \
|
||||||
TOOL_BUILD_LIB \
|
TOOL_BUILD_LIB \
|
||||||
|
|
|
@ -38,6 +38,7 @@
|
||||||
#include "libc/sysv/consts/prot.h"
|
#include "libc/sysv/consts/prot.h"
|
||||||
|
|
||||||
static struct {
|
static struct {
|
||||||
|
atomic_uint once;
|
||||||
const char *res;
|
const char *res;
|
||||||
char buf[PATH_MAX];
|
char buf[PATH_MAX];
|
||||||
} g_comdbg;
|
} g_comdbg;
|
||||||
|
@ -124,10 +125,11 @@ static void FindDebugBinaryInit(void) {
|
||||||
* @asyncsignalsafe
|
* @asyncsignalsafe
|
||||||
*/
|
*/
|
||||||
const char *FindDebugBinary(void) {
|
const char *FindDebugBinary(void) {
|
||||||
|
cosmo_once(&g_comdbg.once, FindDebugBinaryInit);
|
||||||
return g_comdbg.res;
|
return g_comdbg.res;
|
||||||
}
|
}
|
||||||
|
|
||||||
// pay startup cost to make this signal safe from the user's perspective
|
// pay startup cost to make this signal safe from the user's perspective
|
||||||
__attribute__((__constructor__(10))) static void FindDebugBinaryCtor(void) {
|
__attribute__((__constructor__(10))) static void FindDebugBinaryCtor(void) {
|
||||||
FindDebugBinaryInit();
|
cosmo_once(&g_comdbg.once, FindDebugBinaryInit);
|
||||||
}
|
}
|
||||||
|
|
|
@ -42,7 +42,3 @@ void *dlsym(void *, const char *) {
|
||||||
int dlclose(void *) {
|
int dlclose(void *) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
int dl_iterate_phdr(int (*)(void *, size_t, void *), void *) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
|
@ -3,8 +3,8 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define __COSMOPOLITAN_MAJOR__ 3
|
#define __COSMOPOLITAN_MAJOR__ 3
|
||||||
#define __COSMOPOLITAN_MINOR__ 7
|
#define __COSMOPOLITAN_MINOR__ 8
|
||||||
#define __COSMOPOLITAN_PATCH__ 1
|
#define __COSMOPOLITAN_PATCH__ 0
|
||||||
#define __COSMOPOLITAN__ \
|
#define __COSMOPOLITAN__ \
|
||||||
(100000000 * __COSMOPOLITAN_MAJOR__ + 1000000 * __COSMOPOLITAN_MINOR__ + \
|
(100000000 * __COSMOPOLITAN_MAJOR__ + 1000000 * __COSMOPOLITAN_MINOR__ + \
|
||||||
__COSMOPOLITAN_PATCH__)
|
__COSMOPOLITAN_PATCH__)
|
||||||
|
@ -93,6 +93,30 @@
|
||||||
#include "libc/integral/llp64.inc"
|
#include "libc/integral/llp64.inc"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#undef __INT_FAST16_MAX__
|
||||||
|
#undef __INT_FAST16_TYPE__
|
||||||
|
#undef __UINT_FAST16_MAX__
|
||||||
|
#undef __INT_FAST16_WIDTH__
|
||||||
|
#undef __UINT_FAST16_TYPE__
|
||||||
|
|
||||||
|
#define __INT_FAST16_MAX__ 2147483647
|
||||||
|
#define __INT_FAST16_TYPE__ int
|
||||||
|
#define __UINT_FAST16_MAX__ 4294967295U
|
||||||
|
#define __INT_FAST16_WIDTH__ 32
|
||||||
|
#define __UINT_FAST16_TYPE__ unsigned int
|
||||||
|
|
||||||
|
#undef __INT_FAST32_MAX__
|
||||||
|
#undef __INT_FAST32_TYPE__
|
||||||
|
#undef __UINT_FAST32_MAX__
|
||||||
|
#undef __INT_FAST32_WIDTH__
|
||||||
|
#undef __UINT_FAST32_TYPE__
|
||||||
|
|
||||||
|
#define __INT_FAST32_MAX__ 2147483647
|
||||||
|
#define __INT_FAST32_TYPE__ int
|
||||||
|
#define __UINT_FAST32_MAX__ 4294967295U
|
||||||
|
#define __INT_FAST32_WIDTH__ 32
|
||||||
|
#define __UINT_FAST32_TYPE__ unsigned int
|
||||||
|
|
||||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||||
#ifdef __STDC__
|
#ifdef __STDC__
|
||||||
#include "libc/integral/c.inc"
|
#include "libc/integral/c.inc"
|
||||||
|
|
22
libc/intrin/personality.c
Normal file
22
libc/intrin/personality.c
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||||||
|
│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │
|
||||||
|
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||||
|
│ Copyright 2024 Justine Alexandra Roberts Tunney │
|
||||||
|
│ │
|
||||||
|
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||||
|
│ any purpose with or without fee is hereby granted, provided that the │
|
||||||
|
│ above copyright notice and this permission notice appear in all copies. │
|
||||||
|
│ │
|
||||||
|
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||||
|
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||||
|
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||||
|
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||||
|
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||||
|
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||||
|
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||||
|
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||||
|
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||||
|
|
||||||
|
__attribute__((__weak__)) void __gxx_personality_v0() {
|
||||||
|
__builtin_trap();
|
||||||
|
}
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/ammintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/ammintrin.internal.h"
|
#include "third_party/intel/ammintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/amxcomplexintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/amxcomplexintrin.internal.h"
|
#include "third_party/intel/amxcomplexintrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/amxfp16intrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/amxfp16intrin.internal.h"
|
#include "third_party/intel/amxfp16intrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/aarch64/clang/arm_acle.h"
|
||||||
|
#else
|
||||||
#include "third_party/aarch64/arm_acle.internal.h"
|
#include "third_party/aarch64/arm_acle.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/aarch64/clang/arm_bf16.h"
|
||||||
|
#else
|
||||||
#include "third_party/aarch64/arm_bf16.internal.h"
|
#include "third_party/aarch64/arm_bf16.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/aarch64/clang/arm_fp16.h"
|
||||||
|
#else
|
||||||
#include "third_party/aarch64/arm_fp16.internal.h"
|
#include "third_party/aarch64/arm_fp16.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/aarch64/clang/arm_neon.h"
|
||||||
|
#else
|
||||||
#include "third_party/aarch64/arm_neon.internal.h"
|
#include "third_party/aarch64/arm_neon.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_ */
|
||||||
|
|
8
libc/isystem/arm_sve.h
Normal file
8
libc/isystem/arm_sve.h
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_SVE_H_
|
||||||
|
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_SVE_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/aarch64/clang/arm_sve.h"
|
||||||
|
#else
|
||||||
|
#include "third_party/aarch64/arm_sve.internal.h"
|
||||||
|
#endif
|
||||||
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_SVE_H_ */
|
8
libc/isystem/arm_vector_types.h
Normal file
8
libc/isystem/arm_vector_types.h
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_VECTOR_TYPES_H_
|
||||||
|
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_VECTOR_TYPES_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/aarch64/clang/arm_vector_types.h"
|
||||||
|
#else
|
||||||
|
#include "third_party/aarch64/arm_vector_types.internal.h"
|
||||||
|
#endif
|
||||||
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_VECTOR_TYPES_H_ */
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/avxifmaintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/avxifmaintrin.internal.h"
|
#include "third_party/intel/avxifmaintrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/avxneconvertintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/avxneconvertintrin.internal.h"
|
#include "third_party/intel/avxneconvertintrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/avxvnniint16intrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/avxvnniint16intrin.internal.h"
|
#include "third_party/intel/avxvnniint16intrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/avxvnniint8intrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/avxvnniint8intrin.internal.h"
|
#include "third_party/intel/avxvnniint8intrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/clzerointrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/clzerointrin.internal.h"
|
#include "third_party/intel/clzerointrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/cmpccxaddintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/cmpccxaddintrin.internal.h"
|
#include "third_party/intel/cmpccxaddintrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/emmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/emmintrin.internal.h"
|
#include "third_party/intel/emmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/immintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/immintrin.internal.h"
|
#include "third_party/intel/immintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/mm_malloc.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/mm_malloc.internal.h"
|
#include "third_party/intel/mm_malloc.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/mmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/mmintrin.internal.h"
|
#include "third_party/intel/mmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/mwaitxintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/mwaitxintrin.internal.h"
|
#include "third_party/intel/mwaitxintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/nmmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/nmmintrin.internal.h"
|
#include "third_party/intel/nmmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/pmmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/pmmintrin.internal.h"
|
#include "third_party/intel/pmmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/popcntintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/popcntintrin.internal.h"
|
#include "third_party/intel/popcntintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/prfchiintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/prfchiintrin.internal.h"
|
#include "third_party/intel/prfchiintrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/raointintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/raointintrin.internal.h"
|
#include "third_party/intel/raointintrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/sgxintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/sgxintrin.internal.h"
|
#include "third_party/intel/sgxintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/sha512intrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/sha512intrin.internal.h"
|
#include "third_party/intel/sha512intrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/sm3intrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/sm3intrin.internal.h"
|
#include "third_party/intel/sm3intrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/sm4intrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/sm4intrin.internal.h"
|
#include "third_party/intel/sm4intrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/smmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/smmintrin.internal.h"
|
#include "third_party/intel/smmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/tmmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/tmmintrin.internal.h"
|
#include "third_party/intel/tmmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/usermsrintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/usermsrintrin.internal.h"
|
#include "third_party/intel/usermsrintrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/wmmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/wmmintrin.internal.h"
|
#include "third_party/intel/wmmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/x86intrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/x86intrin.internal.h"
|
#include "third_party/intel/x86intrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/xmmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/xmmintrin.internal.h"
|
#include "third_party/intel/xmmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -42,7 +42,8 @@ $(LIBC_MEM_A_OBJS): private \
|
||||||
COPTS += \
|
COPTS += \
|
||||||
-fno-sanitize=all \
|
-fno-sanitize=all \
|
||||||
-Wframe-larger-than=4096 \
|
-Wframe-larger-than=4096 \
|
||||||
-Walloca-larger-than=4096
|
-Walloca-larger-than=4096 \
|
||||||
|
-fexceptions
|
||||||
|
|
||||||
o/$(MODE)/libc/mem/asan.o: private \
|
o/$(MODE)/libc/mem/asan.o: private \
|
||||||
CFLAGS += \
|
CFLAGS += \
|
||||||
|
|
|
@ -7,10 +7,10 @@ void *bsearch(const void *, const void *, size_t, size_t,
|
||||||
void *bsearch_r(const void *, const void *, size_t, size_t,
|
void *bsearch_r(const void *, const void *, size_t, size_t,
|
||||||
int (*)(const void *, const void *, void *), void *)
|
int (*)(const void *, const void *, void *), void *)
|
||||||
paramsnonnull((1, 2, 5)) nosideeffect;
|
paramsnonnull((1, 2, 5)) nosideeffect;
|
||||||
void qsort3(void *, size_t, size_t,
|
void qsort3(void *, size_t, size_t, int (*)(const void *, const void *))
|
||||||
int (*)(const void *, const void *)) libcesque paramsnonnull();
|
paramsnonnull();
|
||||||
void qsort(void *, size_t, size_t,
|
void qsort(void *, size_t, size_t, int (*)(const void *, const void *))
|
||||||
int (*)(const void *, const void *)) libcesque paramsnonnull();
|
paramsnonnull();
|
||||||
void qsort_r(void *, size_t, size_t,
|
void qsort_r(void *, size_t, size_t,
|
||||||
int (*)(const void *, const void *, void *), void *)
|
int (*)(const void *, const void *, void *), void *)
|
||||||
paramsnonnull((1, 4));
|
paramsnonnull((1, 4));
|
||||||
|
|
|
@ -76,9 +76,9 @@
|
||||||
#define FLAGS_PRECISION 0x20
|
#define FLAGS_PRECISION 0x20
|
||||||
#define FLAGS_ISSIGNED 0x40
|
#define FLAGS_ISSIGNED 0x40
|
||||||
#define FLAGS_NOQUOTE 0x80
|
#define FLAGS_NOQUOTE 0x80
|
||||||
|
#define FLAGS_REPR 0x100
|
||||||
#define FLAGS_QUOTE FLAGS_SPACE
|
#define FLAGS_QUOTE FLAGS_SPACE
|
||||||
#define FLAGS_GROUPING FLAGS_NOQUOTE
|
#define FLAGS_GROUPING FLAGS_NOQUOTE
|
||||||
#define FLAGS_REPR FLAGS_PLUS
|
|
||||||
|
|
||||||
#define __FMT_PUT(C) \
|
#define __FMT_PUT(C) \
|
||||||
do { \
|
do { \
|
||||||
|
|
|
@ -105,7 +105,7 @@ int main(int argc, char *argv[]) {
|
||||||
__log_level = kLogInfo;
|
__log_level = kLogInfo;
|
||||||
GetOpts(argc, argv);
|
GetOpts(argc, argv);
|
||||||
|
|
||||||
for (fd = 3; fd < 10; ++fd) {
|
for (fd = 3; fd < 100; ++fd) {
|
||||||
close(fd);
|
close(fd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
248
test/ctl/shared_ptr_test.cc
Normal file
248
test/ctl/shared_ptr_test.cc
Normal file
|
@ -0,0 +1,248 @@
|
||||||
|
// -*- mode:c++; indent-tabs-mode:nil; c-basic-offset:4; coding:utf-8 -*-
|
||||||
|
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
|
||||||
|
//
|
||||||
|
// Copyright 2024 Justine Alexandra Roberts Tunney
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and/or distribute this software for
|
||||||
|
// any purpose with or without fee is hereby granted, provided that the
|
||||||
|
// above copyright notice and this permission notice appear in all copies.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
|
||||||
|
// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
|
||||||
|
// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
|
||||||
|
// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
|
||||||
|
// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
|
||||||
|
// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
||||||
|
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
||||||
|
// PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
|
||||||
|
#include "ctl/shared_ptr.h"
|
||||||
|
#include "ctl/vector.h"
|
||||||
|
#include "libc/mem/leaks.h"
|
||||||
|
|
||||||
|
// #include <memory>
|
||||||
|
// #include <vector>
|
||||||
|
// #define ctl std
|
||||||
|
|
||||||
|
using ctl::bad_weak_ptr;
|
||||||
|
using ctl::make_shared;
|
||||||
|
using ctl::move;
|
||||||
|
using ctl::shared_ptr;
|
||||||
|
using ctl::unique_ptr;
|
||||||
|
using ctl::vector;
|
||||||
|
using ctl::weak_ptr;
|
||||||
|
|
||||||
|
#undef ctl
|
||||||
|
|
||||||
|
static int g = 0;
|
||||||
|
|
||||||
|
struct ConstructG
|
||||||
|
{
|
||||||
|
ConstructG()
|
||||||
|
{
|
||||||
|
++g;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct DestructG
|
||||||
|
{
|
||||||
|
~DestructG()
|
||||||
|
{
|
||||||
|
++g;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct CallG
|
||||||
|
{
|
||||||
|
void operator()(auto*) const noexcept
|
||||||
|
{
|
||||||
|
++g;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Base
|
||||||
|
{};
|
||||||
|
|
||||||
|
struct Derived : Base
|
||||||
|
{};
|
||||||
|
|
||||||
|
int
|
||||||
|
main()
|
||||||
|
{
|
||||||
|
int a, b;
|
||||||
|
|
||||||
|
{
|
||||||
|
// Shouldn't cause memory leaks.
|
||||||
|
shared_ptr<int> x(new int(5));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// Objects get destroyed when the last shared_ptr is reset.
|
||||||
|
shared_ptr<int> x(&a, CallG());
|
||||||
|
shared_ptr<int> y(x);
|
||||||
|
x.reset();
|
||||||
|
if (g)
|
||||||
|
return 1;
|
||||||
|
y.reset();
|
||||||
|
if (g != 1)
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
g = 0;
|
||||||
|
// Weak pointers don't prevent object destruction.
|
||||||
|
shared_ptr<int> x(&a, CallG());
|
||||||
|
weak_ptr<int> y(x);
|
||||||
|
x.reset();
|
||||||
|
if (g != 1)
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
g = 0;
|
||||||
|
// Weak pointers can be promoted to shared pointers.
|
||||||
|
shared_ptr<int> x(&a, CallG());
|
||||||
|
weak_ptr<int> y(x);
|
||||||
|
auto z = y.lock();
|
||||||
|
x.reset();
|
||||||
|
if (g)
|
||||||
|
return 4;
|
||||||
|
y.reset();
|
||||||
|
if (g)
|
||||||
|
return 5;
|
||||||
|
z.reset();
|
||||||
|
if (g != 1)
|
||||||
|
return 6;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// Shared null pointers are falsey.
|
||||||
|
shared_ptr<int> x;
|
||||||
|
if (x)
|
||||||
|
return 7;
|
||||||
|
x.reset(new int);
|
||||||
|
if (!x)
|
||||||
|
return 8;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// You can cast a shared pointer validly.
|
||||||
|
shared_ptr<Derived> x(new Derived);
|
||||||
|
shared_ptr<Base> y(x);
|
||||||
|
// But not invalidly:
|
||||||
|
// shared_ptr<Base> x(new Derived);
|
||||||
|
// shared_ptr<Derived> y(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// You can cast a shared pointer to void to retain a reference.
|
||||||
|
shared_ptr<int> x(new int);
|
||||||
|
shared_ptr<void> y(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// You can also create a shared pointer to void in the first place.
|
||||||
|
shared_ptr<void> x(new int);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// You can take a shared pointer to a subobject, and it will free the
|
||||||
|
// base object.
|
||||||
|
shared_ptr<vector<int>> x(new vector<int>);
|
||||||
|
x->push_back(5);
|
||||||
|
shared_ptr<int> y(x, &x->at(0));
|
||||||
|
x.reset();
|
||||||
|
if (*y != 5)
|
||||||
|
return 9;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
g = 0;
|
||||||
|
// You can create a shared_ptr from a unique_ptr.
|
||||||
|
unique_ptr<int, CallG> x(&a, CallG());
|
||||||
|
shared_ptr<int> y(move(x));
|
||||||
|
if (x)
|
||||||
|
return 10;
|
||||||
|
y.reset();
|
||||||
|
if (g != 1)
|
||||||
|
return 11;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
g = 0;
|
||||||
|
// You can reassign shared_ptrs.
|
||||||
|
shared_ptr<int> x(&a, CallG());
|
||||||
|
shared_ptr<int> y;
|
||||||
|
y = x;
|
||||||
|
x.reset();
|
||||||
|
if (g)
|
||||||
|
return 12;
|
||||||
|
y.reset();
|
||||||
|
if (g != 1)
|
||||||
|
return 13;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// owner_before works across shared and weak pointers.
|
||||||
|
shared_ptr<int> x(&a, CallG());
|
||||||
|
shared_ptr<int> y(&b, CallG());
|
||||||
|
if (!x.owner_before(y))
|
||||||
|
return 14;
|
||||||
|
if (!x.owner_before(weak_ptr<int>(y)))
|
||||||
|
return 15;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// Use counts work like you'd expect
|
||||||
|
shared_ptr<int> x(new int);
|
||||||
|
if (x.use_count() != 1)
|
||||||
|
return 16;
|
||||||
|
shared_ptr<int> y(x);
|
||||||
|
if (x.use_count() != 2 || y.use_count() != 2)
|
||||||
|
return 17;
|
||||||
|
x.reset();
|
||||||
|
if (x.use_count() != 0 || y.use_count() != 1)
|
||||||
|
return 18;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// There is a make_shared that will allocate an object for you safely.
|
||||||
|
auto x = make_shared<int>(5);
|
||||||
|
if (!x)
|
||||||
|
return 19;
|
||||||
|
if (*x != 5)
|
||||||
|
return 20;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// Expired weak pointers lock to nullptr, and throw when promoted to
|
||||||
|
// shared pointer by constructor.
|
||||||
|
auto x = make_shared<int>();
|
||||||
|
weak_ptr<int> y(x);
|
||||||
|
x.reset();
|
||||||
|
if (y.lock())
|
||||||
|
return 21;
|
||||||
|
int caught = 0;
|
||||||
|
try {
|
||||||
|
shared_ptr<int> z(y);
|
||||||
|
} catch (bad_weak_ptr& e) {
|
||||||
|
caught = 1;
|
||||||
|
}
|
||||||
|
if (!caught)
|
||||||
|
return 22;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// nullptr is always expired.
|
||||||
|
shared_ptr<int> x(nullptr);
|
||||||
|
weak_ptr<int> y(x);
|
||||||
|
if (!y.expired())
|
||||||
|
return 23;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO(mrdomino): exercise threads / races. The reference count should be
|
||||||
|
// atomically maintained.
|
||||||
|
|
||||||
|
CheckForMemoryLeaks();
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -27,3 +27,11 @@ TEST(snprintf, testVeryLargePrecision) {
|
||||||
ASSERT_EQ(i, 9999);
|
ASSERT_EQ(i, 9999);
|
||||||
ASSERT_EQ(strlen(buf), 511);
|
ASSERT_EQ(strlen(buf), 511);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(snprintf, testPlusFlagOnChar) {
|
||||||
|
char buf[10] = {};
|
||||||
|
int i = snprintf(buf, sizeof(buf), "%+c", '=');
|
||||||
|
|
||||||
|
ASSERT_EQ(i, 1);
|
||||||
|
ASSERT_STREQ(buf, "=");
|
||||||
|
}
|
||||||
|
|
|
@ -48,6 +48,8 @@ TEST_LIBC_TINYMATH_DIRECTDEPS = \
|
||||||
THIRD_PARTY_DOUBLECONVERSION \
|
THIRD_PARTY_DOUBLECONVERSION \
|
||||||
THIRD_PARTY_GDTOA \
|
THIRD_PARTY_GDTOA \
|
||||||
THIRD_PARTY_LIBCXX \
|
THIRD_PARTY_LIBCXX \
|
||||||
|
THIRD_PARTY_LIBCXXABI \
|
||||||
|
THIRD_PARTY_LIBUNWIND \
|
||||||
|
|
||||||
TEST_LIBC_TINYMATH_DEPS := \
|
TEST_LIBC_TINYMATH_DEPS := \
|
||||||
$(call uniq,$(foreach x,$(TEST_LIBC_TINYMATH_DIRECTDEPS),$($(x))))
|
$(call uniq,$(foreach x,$(TEST_LIBC_TINYMATH_DIRECTDEPS),$($(x))))
|
||||||
|
|
|
@ -21,13 +21,6 @@
|
||||||
#include "libc/mem/mem.h"
|
#include "libc/mem/mem.h"
|
||||||
#include "libc/runtime/runtime.h"
|
#include "libc/runtime/runtime.h"
|
||||||
|
|
||||||
// this dontthrow keyword SHOULD break this test. it's probably passing
|
|
||||||
// because we're currently using SjLj exceptions. the day we can change
|
|
||||||
// things, remove `dontthrow` and this test will still be a useful help
|
|
||||||
extern "C" dontthrow void qsort_(void *, size_t, size_t,
|
|
||||||
int (*)(const void *,
|
|
||||||
const void *)) asm("qsort");
|
|
||||||
|
|
||||||
struct Resource {
|
struct Resource {
|
||||||
char *p;
|
char *p;
|
||||||
Resource() {
|
Resource() {
|
||||||
|
@ -60,7 +53,7 @@ int A[3] = {3, 2, 1};
|
||||||
int Work(void) {
|
int Work(void) {
|
||||||
Resource r;
|
Resource r;
|
||||||
pPoke(r.p);
|
pPoke(r.p);
|
||||||
qsort_(A, 3, sizeof(int), cmp);
|
qsort(A, 3, sizeof(int), cmp);
|
||||||
return A[0];
|
return A[0];
|
||||||
}
|
}
|
||||||
int (*pWork)(void) = Work;
|
int (*pWork)(void) = Work;
|
||||||
|
|
2
third_party/aarch64/BUILD.mk
vendored
2
third_party/aarch64/BUILD.mk
vendored
|
@ -3,4 +3,4 @@
|
||||||
|
|
||||||
PKGS += THIRD_PARTY_AARCH64
|
PKGS += THIRD_PARTY_AARCH64
|
||||||
THIRD_PARTY_AARCH64_HDRS = $(filter %.h,$(THIRD_PARTY_AARCH64_FILES))
|
THIRD_PARTY_AARCH64_HDRS = $(filter %.h,$(THIRD_PARTY_AARCH64_FILES))
|
||||||
THIRD_PARTY_AARCH64_FILES := $(wildcard third_party/aarch64/*)
|
THIRD_PARTY_AARCH64_FILES := $(wildcard third_party/aarch64/*) $(wildcard third_party/aarch64/clang/*)
|
||||||
|
|
35
third_party/aarch64/clang/arm64intr.h
vendored
Normal file
35
third_party/aarch64/clang/arm64intr.h
vendored
Normal file
|
@ -0,0 +1,35 @@
|
||||||
|
/*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Only include this if we're compiling for the windows platform. */
|
||||||
|
#ifndef _MSC_VER
|
||||||
|
#include_next <arm64intr.h>
|
||||||
|
#else
|
||||||
|
|
||||||
|
#ifndef __ARM64INTR_H
|
||||||
|
#define __ARM64INTR_H
|
||||||
|
|
||||||
|
typedef enum
|
||||||
|
{
|
||||||
|
_ARM64_BARRIER_SY = 0xF,
|
||||||
|
_ARM64_BARRIER_ST = 0xE,
|
||||||
|
_ARM64_BARRIER_LD = 0xD,
|
||||||
|
_ARM64_BARRIER_ISH = 0xB,
|
||||||
|
_ARM64_BARRIER_ISHST = 0xA,
|
||||||
|
_ARM64_BARRIER_ISHLD = 0x9,
|
||||||
|
_ARM64_BARRIER_NSH = 0x7,
|
||||||
|
_ARM64_BARRIER_NSHST = 0x6,
|
||||||
|
_ARM64_BARRIER_NSHLD = 0x5,
|
||||||
|
_ARM64_BARRIER_OSH = 0x3,
|
||||||
|
_ARM64_BARRIER_OSHST = 0x2,
|
||||||
|
_ARM64_BARRIER_OSHLD = 0x1
|
||||||
|
} _ARM64INTR_BARRIER_TYPE;
|
||||||
|
|
||||||
|
#endif /* __ARM64INTR_H */
|
||||||
|
#endif /* _MSC_VER */
|
888
third_party/aarch64/clang/arm_acle.h
vendored
Normal file
888
third_party/aarch64/clang/arm_acle.h
vendored
Normal file
|
@ -0,0 +1,888 @@
|
||||||
|
/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
* The Arm C Language Extensions specifications can be found in the following
|
||||||
|
* link: https://github.com/ARM-software/acle/releases
|
||||||
|
*
|
||||||
|
* The ACLE section numbers are subject to change. When consulting the
|
||||||
|
* specifications, it is recommended to search using section titles if
|
||||||
|
* the section numbers look outdated.
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __ARM_ACLE_H
|
||||||
|
#define __ARM_ACLE_H
|
||||||
|
|
||||||
|
#ifndef __ARM_ACLE
|
||||||
|
#error "ACLE intrinsics support not enabled."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 7 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
|
||||||
|
/* 7.3 Memory barriers */
|
||||||
|
#if !__has_builtin(__dmb)
|
||||||
|
#define __dmb(i) __builtin_arm_dmb(i)
|
||||||
|
#endif
|
||||||
|
#if !__has_builtin(__dsb)
|
||||||
|
#define __dsb(i) __builtin_arm_dsb(i)
|
||||||
|
#endif
|
||||||
|
#if !__has_builtin(__isb)
|
||||||
|
#define __isb(i) __builtin_arm_isb(i)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 7.4 Hints */
|
||||||
|
|
||||||
|
#if !__has_builtin(__wfi)
|
||||||
|
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
|
||||||
|
__builtin_arm_wfi();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !__has_builtin(__wfe)
|
||||||
|
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
|
||||||
|
__builtin_arm_wfe();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !__has_builtin(__sev)
|
||||||
|
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
|
||||||
|
__builtin_arm_sev();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !__has_builtin(__sevl)
|
||||||
|
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
|
||||||
|
__builtin_arm_sevl();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !__has_builtin(__yield)
|
||||||
|
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
|
||||||
|
__builtin_arm_yield();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
|
||||||
|
#define __dbg(t) __builtin_arm_dbg(t)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
|
||||||
|
#define _CHKFEAT_GCS 1
|
||||||
|
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
__chkfeat(uint64_t __features) {
|
||||||
|
return __builtin_arm_chkfeat(__features) ^ __features;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 7.5 Swap */
|
||||||
|
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
__swp(uint32_t __x, volatile uint32_t *__p) {
|
||||||
|
uint32_t v;
|
||||||
|
do
|
||||||
|
v = __builtin_arm_ldrex(__p);
|
||||||
|
while (__builtin_arm_strex(__x, __p));
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 7.6 Memory prefetch intrinsics */
|
||||||
|
/* 7.6.1 Data prefetch */
|
||||||
|
#define __pld(addr) __pldx(0, 0, 0, addr)
|
||||||
|
|
||||||
|
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
|
||||||
|
#define __pldx(access_kind, cache_level, retention_policy, addr) \
|
||||||
|
__builtin_arm_prefetch(addr, access_kind, 1)
|
||||||
|
#else
|
||||||
|
#define __pldx(access_kind, cache_level, retention_policy, addr) \
|
||||||
|
__builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 7.6.2 Instruction prefetch */
|
||||||
|
#define __pli(addr) __plix(0, 0, addr)
|
||||||
|
|
||||||
|
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
|
||||||
|
#define __plix(cache_level, retention_policy, addr) \
|
||||||
|
__builtin_arm_prefetch(addr, 0, 0)
|
||||||
|
#else
|
||||||
|
#define __plix(cache_level, retention_policy, addr) \
|
||||||
|
__builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 7.7 NOP */
|
||||||
|
#if !defined(_MSC_VER) || (!defined(__aarch64__) && !defined(__arm64ec__))
|
||||||
|
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
|
||||||
|
__builtin_arm_nop();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 8 DATA-PROCESSING INTRINSICS */
|
||||||
|
/* 8.2 Miscellaneous data-processing intrinsics */
|
||||||
|
/* ROR */
|
||||||
|
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
__ror(uint32_t __x, uint32_t __y) {
|
||||||
|
__y %= 32;
|
||||||
|
if (__y == 0)
|
||||||
|
return __x;
|
||||||
|
return (__x >> __y) | (__x << (32 - __y));
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
__rorll(uint64_t __x, uint32_t __y) {
|
||||||
|
__y %= 64;
|
||||||
|
if (__y == 0)
|
||||||
|
return __x;
|
||||||
|
return (__x >> __y) | (__x << (64 - __y));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ROR for unsigned long: dispatches on the width of long (32-bit on ILP32
   targets, 64-bit on LP64 targets). */
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rorl(unsigned long __x, uint32_t __y) {
#if __SIZEOF_LONG__ == 4
  return __ror(__x, __y);
#else
  return __rorll(__x, __y);
#endif
}
|
||||||
|
|
||||||
|
|
||||||
|
/* CLZ */
/* Count leading zero bits of a 32-bit value (CLZ instruction). */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t __t) {
  return __builtin_arm_clz(__t);
}

/* Count leading zeros of an unsigned long, at whichever width long has. */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_arm_clz(__t);
#else
  return __builtin_arm_clz64(__t);
#endif
}

/* Count leading zeros of a 64-bit value. */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t __t) {
  return __builtin_arm_clz64(__t);
}
|
||||||
|
|
||||||
|
/* CLS */
/* Count leading sign bits of a 32-bit value (CLS instruction). */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__cls(uint32_t __t) {
  return __builtin_arm_cls(__t);
}

/* Count leading sign bits of an unsigned long, at whichever width long has. */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_arm_cls(__t);
#else
  return __builtin_arm_cls64(__t);
#endif
}

/* Count leading sign bits of a 64-bit value. */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsll(uint64_t __t) {
  return __builtin_arm_cls64(__t);
}
|
||||||
|
|
||||||
|
/* REV */
/* Reverse the byte order of a 32-bit value. */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev(uint32_t __t) {
  return __builtin_bswap32(__t);
}

/* Reverse the byte order of an unsigned long, at whichever width long has. */
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__revl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_bswap32(__t);
#else
  return __builtin_bswap64(__t);
#endif
}

/* Reverse the byte order of a 64-bit value. */
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__revll(uint64_t __t) {
  return __builtin_bswap64(__t);
}
|
||||||
|
|
||||||
|
/* REV16 */
/* Reverse the bytes within each 16-bit halfword of a 32-bit value:
   a full byte swap followed by a 16-bit rotate restores the halfword
   order while leaving the bytes inside each halfword swapped. */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev16(uint32_t __t) {
  return __ror(__rev(__t), 16);
}

/* 64-bit REV16: apply the 32-bit operation independently to each half. */
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t __t) {
  return (((uint64_t)__rev16(__t >> 32)) << 32) | (uint64_t)__rev16((uint32_t)__t);
}

/* REV16 for unsigned long, at whichever width long has. */
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rev16l(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __rev16(__t);
#else
  return __rev16ll(__t);
#endif
}
|
||||||
|
|
||||||
|
/* REVSH */
|
||||||
|
/* REVSH: swap the two bytes of a signed 16-bit value, returning the
   byte-reversed halfword reinterpreted as signed. */
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t __t) {
  uint16_t __u = (uint16_t)__t;
  return (int16_t)(uint16_t)((uint16_t)(__u >> 8) | (uint16_t)(__u << 8));
}
|
||||||
|
|
||||||
|
/* RBIT */
/* Reverse the bit order of a 32-bit value. */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rbit(uint32_t __t) {
  return __builtin_arm_rbit(__t);
}

/* Reverse the bit order of a 64-bit value.  AArch32 has no 64-bit RBIT, so
   compose it from two 32-bit reversals with the halves exchanged. */
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t __t) {
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
  return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
         __builtin_arm_rbit(__t >> 32);
#else
  return __builtin_arm_rbit64(__t);
#endif
}

/* RBIT for unsigned long, at whichever width long has. */
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rbitl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __rbit(__t);
#else
  return __rbitll(__t);
#endif
}
|
||||||
|
|
||||||
|
/* 8.3 16-bit multiplications */
/* Each wrapper maps to the DSP multiply instruction of the same name
   (b = bottom halfword operand, t = top halfword operand, w = word). */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultb(int32_t __a, int32_t __b) {
  return __builtin_arm_smultb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultt(int32_t __a, int32_t __b) {
  return __builtin_arm_smultt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwt(__a, __b);
}
#endif
|
||||||
|
|
||||||
|
/*
 * 8.4 Saturating intrinsics
 *
 * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
 * intrinsics are implemented and the flag is enabled.
 */
/* 8.4.1 Width-specified saturation intrinsics */
#if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
/* Saturate x into a signed (__ssat) or unsigned (__usat) bit-field of
   width y; y must be a compile-time constant for the underlying builtin. */
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif
|
||||||
|
|
||||||
|
/* 8.4.2 Saturating addition and subtraction intrinsics */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
/* Saturating 32-bit addition (QADD instruction). */
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t __t, int32_t __v) {
  return __builtin_arm_qadd(__t, __v);
}

/* Saturating 32-bit subtraction (QSUB instruction). */
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qsub(int32_t __t, int32_t __v) {
  return __builtin_arm_qsub(__t, __v);
}

/* Saturating doubling, expressed as a saturating add of __t with itself. */
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qdbl(int32_t __t) {
  return __builtin_arm_qadd(__t, __t);
}
#endif
|
||||||
|
|
||||||
|
/* 8.4.3 Accumulating multiplications */
/* Each wrapper maps to the DSP multiply-accumulate instruction of the same
   name (b = bottom halfword, t = top halfword, w = word); __c is the
   accumulator. */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawt(__a, __b, __c);
}
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/* 8.5.4 Parallel 16-bit saturation */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
/* Saturate each 16-bit lane of x to a signed (__ssat16) or unsigned
   (__usat16) bit-field of width y. */
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif
|
||||||
|
|
||||||
|
/* 8.5.5 Packing and unpacking */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
/* SIMD32 lane types: a 32-bit word viewed as four 8-bit or two 16-bit
   lanes.  The underlying representation is just a (u)int32_t. */
typedef int32_t int8x4_t;
typedef int32_t int16x2_t;
typedef uint32_t uint8x4_t;
typedef uint32_t uint16x2_t;

/* These wrap the extend/extend-and-add instructions of the same name
   (SXTAB16/SXTB16/UXTAB16/UXTB16). */
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_sxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtb16(int8x4_t __a) {
  return __builtin_arm_sxtb16(__a);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_uxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtb16(int8x4_t __a) {
  return __builtin_arm_uxtb16(__a);
}
#endif
|
||||||
|
|
||||||
|
/* 8.5.6 Parallel selection */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
/* Byte-wise select between __a and __b (SEL instruction). */
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_sel(__a, __b);
}
#endif
|
||||||
|
|
||||||
|
/* 8.5.7 Parallel 8-bit addition and subtraction */
/* Four-lane byte operations; each wrapper maps to the SIMD32 instruction
   of the same name (q = saturating, sh/uh = signed/unsigned halving,
   uq = unsigned saturating). */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__sadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_sadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__ssub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_ssub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__usub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usub8(__a, __b);
}
#endif
|
||||||
|
|
||||||
|
/* 8.5.8 Sum of 8-bit absolute differences */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
/* Sum of absolute differences of the four byte lanes (USAD8). */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usad8(__a, __b);
}
/* Same, accumulated into __c (USADA8). */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
  return __builtin_arm_usada8(__a, __b, __c);
}
#endif
|
||||||
|
|
||||||
|
/* 8.5.9 Parallel 16-bit addition and subtraction */
/* Two-lane halfword operations; each wrapper maps to the SIMD32
   instruction of the same name (q = saturating, sh/uh = signed/unsigned
   halving, uq = unsigned saturating; asx/sax = exchange-then-add-subtract
   / subtract-add cross forms). */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usub16(__a, __b);
}
#endif
|
||||||
|
|
||||||
|
/* 8.5.10 Parallel 16-bit multiplication */
/* Dual halfword multiply (and accumulate) operations; each wrapper maps
   to the SIMD32 instruction of the same name (d = dual, l = long 64-bit
   accumulator, s = subtract of products, x = exchanged operand lanes). */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlad(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smladx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlald(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlaldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsd(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsdx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsld(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuad(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuad(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuadx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuadx(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusd(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusd(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusdx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusdx(__a, __b);
}
#endif
|
||||||
|
|
||||||
|
/* 8.6 Floating-point data-processing intrinsics */
#if (defined(__ARM_FEATURE_DIRECTED_ROUNDING) && \
     (__ARM_FEATURE_DIRECTED_ROUNDING)) && \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
/* Round to nearest integral value, ties to even (roundeven). */
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__rintn(double __a) {
  return __builtin_roundeven(__a);
}

/* Single-precision variant. */
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__rintnf(float __a) {
  return __builtin_roundevenf(__a);
}
#endif
|
||||||
|
|
||||||
|
/* 8.8 CRC32 intrinsics */
/* CRC32 (and CRC32C, Castagnoli polynomial) accumulation over a byte,
   halfword, word, or doubleword; __a is the running CRC, __b the new data.
   Each wrapper maps to the CRC instruction of the same name and requires
   the "crc" target feature. */
#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) || \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32b(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32b(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32h(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32h(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32w(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32w(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32d(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32d(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cb(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32cb(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32ch(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32ch(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cw(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32cw(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cd(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32cd(__a, __b);
}
#endif
|
||||||
|
|
||||||
|
/* 8.6 Floating-point data-processing intrinsics */
/* Armv8.3-A Javascript conversion intrinsic */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
/* Convert a double to int32 with Javascript semantics; requires the
   "v8.3a" target feature. */
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
__jcvt(double __a) {
  return __builtin_arm_jcvt(__a);
}
#endif
|
||||||
|
|
||||||
|
/* Armv8.5-A FP rounding intrinsics */
/* Round to an integral value that fits in a 32-bit (rint32*) or 64-bit
   (rint64*) integer range; z = toward zero, x = using the current rounding
   mode.  Each wrapper maps to the FRINT* builtin of the same name and
   requires the "v8.5a" target feature. */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32zf(float __a) {
  return __builtin_arm_rint32zf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32z(double __a) {
  return __builtin_arm_rint32z(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64zf(float __a) {
  return __builtin_arm_rint64zf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64z(double __a) {
  return __builtin_arm_rint64z(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32xf(float __a) {
  return __builtin_arm_rint32xf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32x(double __a) {
  return __builtin_arm_rint32x(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64xf(float __a) {
  return __builtin_arm_rint64xf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64x(double __a) {
  return __builtin_arm_rint64x(__a);
}
#endif
|
||||||
|
|
||||||
|
/* 8.9 Armv8.7-A load/store 64-byte intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
/* 64 bytes of data, transferred as a unit by the LS64 instructions. */
typedef struct {
  uint64_t val[8];
} data512_t;

/* Load 64 bytes from __addr into a data512_t (LD64B); requires the
   "ls64" target feature. */
static __inline__ data512_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_ld64b(const void *__addr) {
  data512_t __value;
  __builtin_arm_ld64b(__addr, __value.val);
  return __value;
}
/* Store 64 bytes to __addr (ST64B). */
static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64b(void *__addr, data512_t __value) {
  __builtin_arm_st64b(__addr, __value.val);
}
/* Store 64 bytes with a status result (ST64BV). */
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv(__addr, __value.val);
}
/* Store 64 bytes with a status result (ST64BV0 variant). */
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv0(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv0(__addr, __value.val);
}
#endif
|
||||||
|
|
||||||
|
/* 11.1 Special register intrinsics */
/* Read (rsr*) and write (wsr*) system registers by name, at 32-bit, 64-bit,
   128-bit, and pointer widths. */
#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
#define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
/* Floating-point variants: bit-cast the raw register value to/from
   float/double rather than doing a numeric conversion. */
#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
#define __arm_wsr128(sysreg, v) __builtin_arm_wsr128(sysreg, v)
#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))
|
||||||
|
|
||||||
|
/* 10.3 MTE intrinsics */
/* Memory Tagging Extension helpers; each maps to the builtin for the
   corresponding instruction (IRG, ADDG, GMI, LDG, STG, SUBP). */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask)
#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset)
#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded)
#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)

/* 18 memcpy family of operations intrinsics - MOPS */
/* Tag-setting memset from the Memory Copy/Set (MOPS) extension. */
#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
  __builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
#endif
|
||||||
|
|
||||||
|
/* 11.3 Coprocessor Intrinsics */
#if defined(__ARM_FEATURE_COPROC)

/* Bit 0 of __ARM_FEATURE_COPROC: the basic coprocessor instructions
   (CDP, LDC/STC, MCR/MRC). */
#if (__ARM_FEATURE_COPROC & 0x1)

/* CDP is only defined here for pre-Armv8 architectures; the Armv8-M
   Mainline case is handled separately below. */
#if (__ARM_ARCH < 8)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#endif /* __ARM_ARCH < 8 */

#define __arm_ldc(coproc, CRd, p) __builtin_arm_ldc(coproc, CRd, p)
#define __arm_stc(coproc, CRd, p) __builtin_arm_stc(coproc, CRd, p)

#define __arm_mcr(coproc, opc1, value, CRn, CRm, opc2) \
  __builtin_arm_mcr(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc(coproc, opc1, CRn, CRm, opc2) \
  __builtin_arm_mrc(coproc, opc1, CRn, CRm, opc2)

/* The "long" LDC/STC forms are only defined here for pre-Armv8
   architectures other than Armv4. */
#if (__ARM_ARCH != 4) && (__ARM_ARCH < 8)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* (__ARM_ARCH != 4) && (__ARM_ARCH < 8) */

/* Armv8-M Mainline and Armv8.1-M Mainline also get CDP and LDCL/STCL. */
#if (__ARM_ARCH_8M_MAIN__) || (__ARM_ARCH_8_1M_MAIN__)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* __ARM_ARCH_8M_MAIN__ || __ARM_ARCH_8_1M_MAIN__ */

#endif /* __ARM_FEATURE_COPROC & 0x1 */

/* Bit 1: the "2" variants (CDP2, LDC2/STC2, MCR2/MRC2). */
#if (__ARM_FEATURE_COPROC & 0x2)
#define __arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldc2(coproc, CRd, p) __builtin_arm_ldc2(coproc, CRd, p)
#define __arm_stc2(coproc, CRd, p) __builtin_arm_stc2(coproc, CRd, p)
#define __arm_ldc2l(coproc, CRd, p) __builtin_arm_ldc2l(coproc, CRd, p)
#define __arm_stc2l(coproc, CRd, p) __builtin_arm_stc2l(coproc, CRd, p)
#define __arm_mcr2(coproc, opc1, value, CRn, CRm, opc2) \
  __builtin_arm_mcr2(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc2(coproc, opc1, CRn, CRm, opc2) \
  __builtin_arm_mrc2(coproc, opc1, CRn, CRm, opc2)
#endif

/* Bit 2: 64-bit transfers (MCRR/MRRC). */
#if (__ARM_FEATURE_COPROC & 0x4)
#define __arm_mcrr(coproc, opc1, value, CRm) \
  __builtin_arm_mcrr(coproc, opc1, value, CRm)
#define __arm_mrrc(coproc, opc1, CRm) __builtin_arm_mrrc(coproc, opc1, CRm)
#endif

/* Bit 3: the 64-bit "2" transfers (MCRR2/MRRC2). */
#if (__ARM_FEATURE_COPROC & 0x8)
#define __arm_mcrr2(coproc, opc1, value, CRm) \
  __builtin_arm_mcrr2(coproc, opc1, value, CRm)
#define __arm_mrrc2(coproc, opc1, CRm) __builtin_arm_mrrc2(coproc, opc1, CRm)
#endif

#endif // __ARM_FEATURE_COPROC
|
||||||
|
|
||||||
|
/* 17 Transactional Memory Extension (TME) Intrinsics */
#if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME

/* Bit masks decoding the failure status returned on transaction abort. */
#define _TMFAILURE_REASON 0x00007fffu
#define _TMFAILURE_RTRY 0x00008000u
#define _TMFAILURE_CNCL 0x00010000u
#define _TMFAILURE_MEM 0x00020000u
#define _TMFAILURE_IMP 0x00040000u
#define _TMFAILURE_ERR 0x00080000u
#define _TMFAILURE_SIZE 0x00100000u
#define _TMFAILURE_NEST 0x00200000u
#define _TMFAILURE_DBG 0x00400000u
#define _TMFAILURE_INT 0x00800000u
#define _TMFAILURE_TRIVIAL 0x01000000u

/* Start, commit, cancel, and test a transaction (TSTART/TCOMMIT/
   TCANCEL/TTEST). */
#define __tstart() __builtin_arm_tstart()
#define __tcommit() __builtin_arm_tcommit()
#define __tcancel(__arg) __builtin_arm_tcancel(__arg)
#define __ttest() __builtin_arm_ttest()

#endif /* __ARM_FEATURE_TME */
|
||||||
|
|
||||||
|
/* 8.7 Armv8.5-A Random number generation intrinsics */
|
||||||
|
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
|
||||||
|
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
|
||||||
|
__rndr(uint64_t *__p) {
|
||||||
|
return __builtin_arm_rndr(__p);
|
||||||
|
}
|
||||||
|
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
|
||||||
|
__rndrrs(uint64_t *__p) {
|
||||||
|
return __builtin_arm_rndrrs(__p);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 11.2 Guarded Control Stack intrinsics */
|
||||||
|
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
|
||||||
|
static __inline__ void * __attribute__((__always_inline__, __nodebug__))
|
||||||
|
__gcspr() {
|
||||||
|
return (void *)__builtin_arm_rsr64("gcspr_el0");
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("gcs")))
|
||||||
|
__gcspopm() {
|
||||||
|
return __builtin_arm_gcspopm(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ const void * __attribute__((__always_inline__, __nodebug__, target("gcs")))
|
||||||
|
__gcsss(const void *__stack) {
|
||||||
|
return __builtin_arm_gcsss(__stack);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* __ARM_ACLE_H */
|
20
third_party/aarch64/clang/arm_bf16.h
vendored
Normal file
20
third_party/aarch64/clang/arm_bf16.h
vendored
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
/*===---- arm_bf16.h - ARM BF16 intrinsics -----------------------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __ARM_BF16_H
|
||||||
|
#define __ARM_BF16_H
|
||||||
|
|
||||||
|
typedef __bf16 bfloat16_t;
|
||||||
|
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
|
||||||
|
|
||||||
|
|
||||||
|
#undef __ai
|
||||||
|
|
||||||
|
#endif
|
410
third_party/aarch64/clang/arm_cde.h
vendored
Normal file
410
third_party/aarch64/clang/arm_cde.h
vendored
Normal file
|
@ -0,0 +1,410 @@
|
||||||
|
/*===---- arm_cde.h - ARM CDE intrinsics -----------------------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __ARM_CDE_H
|
||||||
|
#define __ARM_CDE_H
|
||||||
|
|
||||||
|
#if !__ARM_FEATURE_CDE
|
||||||
|
#error "CDE support not enabled"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1)))
|
||||||
|
uint32_t __arm_cx1(int, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1a)))
|
||||||
|
uint32_t __arm_cx1a(int, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1d)))
|
||||||
|
uint64_t __arm_cx1d(int, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1da)))
|
||||||
|
uint64_t __arm_cx1da(int, uint64_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2)))
|
||||||
|
uint32_t __arm_cx2(int, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2a)))
|
||||||
|
uint32_t __arm_cx2a(int, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2d)))
|
||||||
|
uint64_t __arm_cx2d(int, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2da)))
|
||||||
|
uint64_t __arm_cx2da(int, uint64_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3)))
|
||||||
|
uint32_t __arm_cx3(int, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3a)))
|
||||||
|
uint32_t __arm_cx3a(int, uint32_t, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3d)))
|
||||||
|
uint64_t __arm_cx3d(int, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3da)))
|
||||||
|
uint64_t __arm_cx3da(int, uint64_t, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1_u32)))
|
||||||
|
uint32_t __arm_vcx1_u32(int, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1a_u32)))
|
||||||
|
uint32_t __arm_vcx1a_u32(int, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1d_u64)))
|
||||||
|
uint64_t __arm_vcx1d_u64(int, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1da_u64)))
|
||||||
|
uint64_t __arm_vcx1da_u64(int, uint64_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2_u32)))
|
||||||
|
uint32_t __arm_vcx2_u32(int, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2a_u32)))
|
||||||
|
uint32_t __arm_vcx2a_u32(int, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2d_u64)))
|
||||||
|
uint64_t __arm_vcx2d_u64(int, uint64_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2da_u64)))
|
||||||
|
uint64_t __arm_vcx2da_u64(int, uint64_t, uint64_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3_u32)))
|
||||||
|
uint32_t __arm_vcx3_u32(int, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3a_u32)))
|
||||||
|
uint32_t __arm_vcx3a_u32(int, uint32_t, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3d_u64)))
|
||||||
|
uint64_t __arm_vcx3d_u64(int, uint64_t, uint64_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3da_u64)))
|
||||||
|
uint64_t __arm_vcx3da_u64(int, uint64_t, uint64_t, uint64_t, uint32_t);
|
||||||
|
|
||||||
|
#if __ARM_FEATURE_MVE
|
||||||
|
|
||||||
|
typedef uint16_t mve_pred16_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) int16_t int16x8_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) int32_t int32x4_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) int64_t int64x2_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) int8_t int8x16_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) uint16_t uint16x8_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) uint32_t uint32x4_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) uint64_t uint64x2_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) uint8_t uint8x16_t;
|
||||||
|
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s16)))
|
||||||
|
int16x8_t __arm_vcx1q_m(int, int16x8_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s32)))
|
||||||
|
int32x4_t __arm_vcx1q_m(int, int32x4_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s64)))
|
||||||
|
int64x2_t __arm_vcx1q_m(int, int64x2_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s8)))
|
||||||
|
int8x16_t __arm_vcx1q_m(int, int8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u16)))
|
||||||
|
uint16x8_t __arm_vcx1q_m(int, uint16x8_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u32)))
|
||||||
|
uint32x4_t __arm_vcx1q_m(int, uint32x4_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u64)))
|
||||||
|
uint64x2_t __arm_vcx1q_m(int, uint64x2_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u8)))
|
||||||
|
uint8x16_t __arm_vcx1q_m(int, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_u8)))
|
||||||
|
uint8x16_t __arm_vcx1q_u8(int, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s16)))
|
||||||
|
int16x8_t __arm_vcx1qa_m(int, int16x8_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s32)))
|
||||||
|
int32x4_t __arm_vcx1qa_m(int, int32x4_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s64)))
|
||||||
|
int64x2_t __arm_vcx1qa_m(int, int64x2_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s8)))
|
||||||
|
int8x16_t __arm_vcx1qa_m(int, int8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u16)))
|
||||||
|
uint16x8_t __arm_vcx1qa_m(int, uint16x8_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u32)))
|
||||||
|
uint32x4_t __arm_vcx1qa_m(int, uint32x4_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u64)))
|
||||||
|
uint64x2_t __arm_vcx1qa_m(int, uint64x2_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u8)))
|
||||||
|
uint8x16_t __arm_vcx1qa_m(int, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s16)))
|
||||||
|
int16x8_t __arm_vcx1qa(int, int16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s32)))
|
||||||
|
int32x4_t __arm_vcx1qa(int, int32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s64)))
|
||||||
|
int64x2_t __arm_vcx1qa(int, int64x2_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s8)))
|
||||||
|
int8x16_t __arm_vcx1qa(int, int8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u16)))
|
||||||
|
uint16x8_t __arm_vcx1qa(int, uint16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u32)))
|
||||||
|
uint32x4_t __arm_vcx1qa(int, uint32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u64)))
|
||||||
|
uint64x2_t __arm_vcx1qa(int, uint64x2_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u8)))
|
||||||
|
uint8x16_t __arm_vcx1qa(int, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s16)))
|
||||||
|
int16x8_t __arm_vcx2q_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s32)))
|
||||||
|
int32x4_t __arm_vcx2q_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s64)))
|
||||||
|
int64x2_t __arm_vcx2q_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s8)))
|
||||||
|
int8x16_t __arm_vcx2q_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u16)))
|
||||||
|
uint16x8_t __arm_vcx2q_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u32)))
|
||||||
|
uint32x4_t __arm_vcx2q_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u64)))
|
||||||
|
uint64x2_t __arm_vcx2q_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx2q_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s16)))
|
||||||
|
int16x8_t __arm_vcx2q(int, int16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s32)))
|
||||||
|
int32x4_t __arm_vcx2q(int, int32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s64)))
|
||||||
|
int64x2_t __arm_vcx2q(int, int64x2_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s8)))
|
||||||
|
int8x16_t __arm_vcx2q(int, int8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u16)))
|
||||||
|
uint16x8_t __arm_vcx2q(int, uint16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u32)))
|
||||||
|
uint32x4_t __arm_vcx2q(int, uint32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u64)))
|
||||||
|
uint64x2_t __arm_vcx2q(int, uint64x2_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8)))
|
||||||
|
uint8x16_t __arm_vcx2q(int, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s16)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, int16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s32)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, int32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s64)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, int64x2_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s8)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, int8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u16)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, uint16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u32)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, uint32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u64)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, uint64x2_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u8)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s16)))
|
||||||
|
int16x8_t __arm_vcx2qa_impl(int, int16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s32)))
|
||||||
|
int32x4_t __arm_vcx2qa_impl(int, int32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s64)))
|
||||||
|
int64x2_t __arm_vcx2qa_impl(int, int64x2_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s8)))
|
||||||
|
int8x16_t __arm_vcx2qa_impl(int, int8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u16)))
|
||||||
|
uint16x8_t __arm_vcx2qa_impl(int, uint16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u32)))
|
||||||
|
uint32x4_t __arm_vcx2qa_impl(int, uint32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u64)))
|
||||||
|
uint64x2_t __arm_vcx2qa_impl(int, uint64x2_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx2qa_impl(int, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s16)))
|
||||||
|
int16x8_t __arm_vcx2qa_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s32)))
|
||||||
|
int32x4_t __arm_vcx2qa_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s64)))
|
||||||
|
int64x2_t __arm_vcx2qa_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s8)))
|
||||||
|
int8x16_t __arm_vcx2qa_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u16)))
|
||||||
|
uint16x8_t __arm_vcx2qa_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u32)))
|
||||||
|
uint32x4_t __arm_vcx2qa_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u64)))
|
||||||
|
uint64x2_t __arm_vcx2qa_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx2qa_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s16)))
|
||||||
|
int16x8_t __arm_vcx3q_impl(int, int16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s32)))
|
||||||
|
int32x4_t __arm_vcx3q_impl(int, int32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s64)))
|
||||||
|
int64x2_t __arm_vcx3q_impl(int, int64x2_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s8)))
|
||||||
|
int8x16_t __arm_vcx3q_impl(int, int8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u16)))
|
||||||
|
uint16x8_t __arm_vcx3q_impl(int, uint16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u32)))
|
||||||
|
uint32x4_t __arm_vcx3q_impl(int, uint32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u64)))
|
||||||
|
uint64x2_t __arm_vcx3q_impl(int, uint64x2_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx3q_impl(int, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s16)))
|
||||||
|
int16x8_t __arm_vcx3q_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s32)))
|
||||||
|
int32x4_t __arm_vcx3q_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s64)))
|
||||||
|
int64x2_t __arm_vcx3q_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s8)))
|
||||||
|
int8x16_t __arm_vcx3q_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u16)))
|
||||||
|
uint16x8_t __arm_vcx3q_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u32)))
|
||||||
|
uint32x4_t __arm_vcx3q_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u64)))
|
||||||
|
uint64x2_t __arm_vcx3q_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx3q_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s16)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, int16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s32)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, int32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s64)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, int64x2_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s8)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, int8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u16)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, uint16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u32)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, uint32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u64)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, uint64x2_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s16)))
|
||||||
|
int16x8_t __arm_vcx3qa_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s32)))
|
||||||
|
int32x4_t __arm_vcx3qa_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s64)))
|
||||||
|
int64x2_t __arm_vcx3qa_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s8)))
|
||||||
|
int8x16_t __arm_vcx3qa_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u16)))
|
||||||
|
uint16x8_t __arm_vcx3qa_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u32)))
|
||||||
|
uint32x4_t __arm_vcx3qa_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u64)))
|
||||||
|
uint64x2_t __arm_vcx3qa_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx3qa_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s16)))
|
||||||
|
int16x8_t __arm_vcx3qa_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s32)))
|
||||||
|
int32x4_t __arm_vcx3qa_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s64)))
|
||||||
|
int64x2_t __arm_vcx3qa_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s8)))
|
||||||
|
int8x16_t __arm_vcx3qa_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u16)))
|
||||||
|
uint16x8_t __arm_vcx3qa_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u32)))
|
||||||
|
uint32x4_t __arm_vcx3qa_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u64)))
|
||||||
|
uint64x2_t __arm_vcx3qa_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx3qa_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u8)))
|
||||||
|
int16x8_t __arm_vreinterpretq_s16_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u8)))
|
||||||
|
int32x4_t __arm_vreinterpretq_s32_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u8)))
|
||||||
|
int64x2_t __arm_vreinterpretq_s64_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u8)))
|
||||||
|
int8x16_t __arm_vreinterpretq_s8_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u8)))
|
||||||
|
uint16x8_t __arm_vreinterpretq_u16_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u8)))
|
||||||
|
uint32x4_t __arm_vreinterpretq_u32_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u8)))
|
||||||
|
uint64x2_t __arm_vreinterpretq_u64_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s16)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(int16x8_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s32)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(int32x4_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s64)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(int64x2_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s8)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(int8x16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u16)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(uint16x8_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u32)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(uint32x4_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u64)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(uint64x2_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vreinterpretq_u8_u8)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(uint8x16_t);
|
||||||
|
#define __arm_vcx2q_m(cp, inactive, n, imm, pred) __arm_vcx2q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), (imm), (pred))
|
||||||
|
#define __arm_vcx2qa(cp, acc, n, imm) __arm_vcx2qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm))
|
||||||
|
#define __arm_vcx2qa_m(cp, acc, n, imm, pred) __arm_vcx2qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm), (pred))
|
||||||
|
#define __arm_vcx3q(cp, n, m, imm) __arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
|
||||||
|
#define __arm_vcx3q_m(cp, inactive, n, m, imm, pred) __arm_vcx3q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
|
||||||
|
#define __arm_vcx3q_u8(cp, n, m, imm) __arm_vcx3q_u8_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
|
||||||
|
#define __arm_vcx3qa(cp, acc, n, m, imm) __arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm))
|
||||||
|
#define __arm_vcx3qa_m(cp, acc, n, m, imm, pred) __arm_vcx3qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
|
||||||
|
|
||||||
|
#endif /* __ARM_FEATURE_MVE */
|
||||||
|
|
||||||
|
#if __ARM_FEATURE_MVE & 2
|
||||||
|
|
||||||
|
typedef __fp16 float16_t;
|
||||||
|
typedef float float32_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) float16_t float16x8_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) float32_t float32x4_t;
|
||||||
|
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f16)))
|
||||||
|
float16x8_t __arm_vcx1q_m(int, float16x8_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f32)))
|
||||||
|
float32x4_t __arm_vcx1q_m(int, float32x4_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f16)))
|
||||||
|
float16x8_t __arm_vcx1qa(int, float16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f32)))
|
||||||
|
float32x4_t __arm_vcx1qa(int, float32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f16)))
|
||||||
|
float16x8_t __arm_vcx1qa_m(int, float16x8_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f32)))
|
||||||
|
float32x4_t __arm_vcx1qa_m(int, float32x4_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f16)))
|
||||||
|
float16x8_t __arm_vcx2q(int, float16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f32)))
|
||||||
|
float32x4_t __arm_vcx2q(int, float32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f16)))
|
||||||
|
float16x8_t __arm_vcx2q_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f32)))
|
||||||
|
float32x4_t __arm_vcx2q_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f16)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, float16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f32)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, float32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f16)))
|
||||||
|
float16x8_t __arm_vcx2qa_impl(int, float16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f32)))
|
||||||
|
float32x4_t __arm_vcx2qa_impl(int, float32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f16)))
|
||||||
|
float16x8_t __arm_vcx2qa_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f32)))
|
||||||
|
float32x4_t __arm_vcx2qa_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f16)))
|
||||||
|
float16x8_t __arm_vcx3q_impl(int, float16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f32)))
|
||||||
|
float32x4_t __arm_vcx3q_impl(int, float32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f16)))
|
||||||
|
float16x8_t __arm_vcx3q_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f32)))
|
||||||
|
float32x4_t __arm_vcx3q_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f16)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, float16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f32)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, float32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f16)))
|
||||||
|
float16x8_t __arm_vcx3qa_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f32)))
|
||||||
|
float32x4_t __arm_vcx3qa_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f16)))
|
||||||
|
float16x8_t __arm_vcx3qa_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f32)))
|
||||||
|
float32x4_t __arm_vcx3qa_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u8)))
|
||||||
|
float16x8_t __arm_vreinterpretq_f16_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u8)))
|
||||||
|
float32x4_t __arm_vreinterpretq_f32_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f16)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(float16x8_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f32)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(float32x4_t);
|
||||||
|
|
||||||
|
#endif /* __ARM_FEATURE_MVE & 2 */
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
} /* extern "C" */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* __ARM_CDE_H */
|
217
third_party/aarch64/clang/arm_cmse.h
vendored
Normal file
217
third_party/aarch64/clang/arm_cmse.h
vendored
Normal file
|
@ -0,0 +1,217 @@
|
||||||
|
//===---- arm_cmse.h - Arm CMSE support -----------------------------------===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#ifndef __ARM_CMSE_H
|
||||||
|
#define __ARM_CMSE_H
|
||||||
|
|
||||||
|
#if (__ARM_FEATURE_CMSE & 0x1)
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#define __ARM_CMSE_SECURE_MODE (__ARM_FEATURE_CMSE & 0x2)
|
||||||
|
#define CMSE_MPU_READWRITE 1 /* checks if readwrite_ok field is set */
|
||||||
|
#define CMSE_AU_NONSECURE 2 /* checks if permissions have secure field unset */
|
||||||
|
#define CMSE_MPU_UNPRIV 4 /* sets T flag on TT insrtuction */
|
||||||
|
#define CMSE_MPU_READ 8 /* checks if read_ok field is set */
|
||||||
|
#define CMSE_MPU_NONSECURE 16 /* sets A flag, checks if secure field unset */
|
||||||
|
#define CMSE_NONSECURE (CMSE_AU_NONSECURE | CMSE_MPU_NONSECURE)
|
||||||
|
|
||||||
|
#define cmse_check_pointed_object(p, f) \
|
||||||
|
cmse_check_address_range((p), sizeof(*(p)), (f))
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef union {
|
||||||
|
struct cmse_address_info {
|
||||||
|
#ifdef __ARM_BIG_ENDIAN
|
||||||
|
/* __ARM_BIG_ENDIAN */
|
||||||
|
#if (__ARM_CMSE_SECURE_MODE)
|
||||||
|
unsigned idau_region : 8;
|
||||||
|
unsigned idau_region_valid : 1;
|
||||||
|
unsigned secure : 1;
|
||||||
|
unsigned nonsecure_readwrite_ok : 1;
|
||||||
|
unsigned nonsecure_read_ok : 1;
|
||||||
|
#else
|
||||||
|
unsigned : 12;
|
||||||
|
#endif
|
||||||
|
unsigned readwrite_ok : 1;
|
||||||
|
unsigned read_ok : 1;
|
||||||
|
#if (__ARM_CMSE_SECURE_MODE)
|
||||||
|
unsigned sau_region_valid : 1;
|
||||||
|
#else
|
||||||
|
unsigned : 1;
|
||||||
|
#endif
|
||||||
|
unsigned mpu_region_valid : 1;
|
||||||
|
#if (__ARM_CMSE_SECURE_MODE)
|
||||||
|
unsigned sau_region : 8;
|
||||||
|
#else
|
||||||
|
unsigned : 8;
|
||||||
|
#endif
|
||||||
|
unsigned mpu_region : 8;
|
||||||
|
|
||||||
|
#else /* __ARM_LITTLE_ENDIAN */
|
||||||
|
unsigned mpu_region : 8;
|
||||||
|
#if (__ARM_CMSE_SECURE_MODE)
|
||||||
|
unsigned sau_region : 8;
|
||||||
|
#else
|
||||||
|
unsigned : 8;
|
||||||
|
#endif
|
||||||
|
unsigned mpu_region_valid : 1;
|
||||||
|
#if (__ARM_CMSE_SECURE_MODE)
|
||||||
|
unsigned sau_region_valid : 1;
|
||||||
|
#else
|
||||||
|
unsigned : 1;
|
||||||
|
#endif
|
||||||
|
unsigned read_ok : 1;
|
||||||
|
unsigned readwrite_ok : 1;
|
||||||
|
#if (__ARM_CMSE_SECURE_MODE)
|
||||||
|
unsigned nonsecure_read_ok : 1;
|
||||||
|
unsigned nonsecure_readwrite_ok : 1;
|
||||||
|
unsigned secure : 1;
|
||||||
|
unsigned idau_region_valid : 1;
|
||||||
|
unsigned idau_region : 8;
|
||||||
|
#else
|
||||||
|
unsigned : 12;
|
||||||
|
#endif
|
||||||
|
#endif /*__ARM_LITTLE_ENDIAN */
|
||||||
|
} flags;
|
||||||
|
unsigned value;
|
||||||
|
} cmse_address_info_t;
|
||||||
|
|
||||||
|
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
cmse_TT(void *__p) {
|
||||||
|
cmse_address_info_t __u;
|
||||||
|
__u.value = __builtin_arm_cmse_TT(__p);
|
||||||
|
return __u;
|
||||||
|
}
|
||||||
|
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
cmse_TTT(void *__p) {
|
||||||
|
cmse_address_info_t __u;
|
||||||
|
__u.value = __builtin_arm_cmse_TTT(__p);
|
||||||
|
return __u;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if __ARM_CMSE_SECURE_MODE
|
||||||
|
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
cmse_TTA(void *__p) {
|
||||||
|
cmse_address_info_t __u;
|
||||||
|
__u.value = __builtin_arm_cmse_TTA(__p);
|
||||||
|
return __u;
|
||||||
|
}
|
||||||
|
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
cmse_TTAT(void *__p) {
|
||||||
|
cmse_address_info_t __u;
|
||||||
|
__u.value = __builtin_arm_cmse_TTAT(__p);
|
||||||
|
return __u;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define cmse_TT_fptr(p) cmse_TT(__builtin_bit_cast(void *, (p)))
|
||||||
|
#define cmse_TTT_fptr(p) cmse_TTT(__builtin_bit_cast(void *, (p)))
|
||||||
|
|
||||||
|
#if __ARM_CMSE_SECURE_MODE
|
||||||
|
#define cmse_TTA_fptr(p) cmse_TTA(__builtin_bit_cast(void *, (p)))
|
||||||
|
#define cmse_TTAT_fptr(p) cmse_TTAT(__builtin_bit_cast(void *, (p)))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static void *__attribute__((__always_inline__))
|
||||||
|
cmse_check_address_range(void *__pb, size_t __s, int __flags) {
|
||||||
|
uintptr_t __begin = (uintptr_t)__pb;
|
||||||
|
uintptr_t __end = __begin + __s - 1;
|
||||||
|
|
||||||
|
if (__end < __begin)
|
||||||
|
return NULL; /* wrap around check */
|
||||||
|
|
||||||
|
/* Check whether the range crosses a 32-bytes aligned address */
|
||||||
|
const int __single_check = (__begin ^ __end) < 0x20u;
|
||||||
|
|
||||||
|
/* execute the right variant of the TT instructions */
|
||||||
|
void *__pe = (void *)__end;
|
||||||
|
cmse_address_info_t __permb, __perme;
|
||||||
|
switch (__flags & (CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
|
||||||
|
case 0:
|
||||||
|
__permb = cmse_TT(__pb);
|
||||||
|
__perme = __single_check ? __permb : cmse_TT(__pe);
|
||||||
|
break;
|
||||||
|
case CMSE_MPU_UNPRIV:
|
||||||
|
__permb = cmse_TTT(__pb);
|
||||||
|
__perme = __single_check ? __permb : cmse_TTT(__pe);
|
||||||
|
break;
|
||||||
|
#if __ARM_CMSE_SECURE_MODE
|
||||||
|
case CMSE_MPU_NONSECURE:
|
||||||
|
__permb = cmse_TTA(__pb);
|
||||||
|
__perme = __single_check ? __permb : cmse_TTA(__pe);
|
||||||
|
break;
|
||||||
|
case CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE:
|
||||||
|
__permb = cmse_TTAT(__pb);
|
||||||
|
__perme = __single_check ? __permb : cmse_TTAT(__pe);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
/* if CMSE_NONSECURE is specified w/o __ARM_CMSE_SECURE_MODE */
|
||||||
|
default:
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* check that the range does not cross MPU, SAU, or IDAU region boundaries */
|
||||||
|
if (__permb.value != __perme.value)
|
||||||
|
return NULL;
|
||||||
|
#if !(__ARM_CMSE_SECURE_MODE)
|
||||||
|
/* CMSE_AU_NONSECURE is only supported when __ARM_FEATURE_CMSE & 0x2 */
|
||||||
|
if (__flags & CMSE_AU_NONSECURE)
|
||||||
|
return NULL;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* check the permission on the range */
|
||||||
|
switch (__flags & ~(CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
|
||||||
|
#if (__ARM_CMSE_SECURE_MODE)
|
||||||
|
case CMSE_MPU_READ | CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
|
||||||
|
case CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
|
||||||
|
return __permb.flags.nonsecure_readwrite_ok ? __pb : NULL;
|
||||||
|
|
||||||
|
case CMSE_MPU_READ | CMSE_AU_NONSECURE:
|
||||||
|
return __permb.flags.nonsecure_read_ok ? __pb : NULL;
|
||||||
|
|
||||||
|
case CMSE_AU_NONSECURE:
|
||||||
|
return __permb.flags.secure ? NULL : __pb;
|
||||||
|
#endif
|
||||||
|
case CMSE_MPU_READ | CMSE_MPU_READWRITE:
|
||||||
|
case CMSE_MPU_READWRITE:
|
||||||
|
return __permb.flags.readwrite_ok ? __pb : NULL;
|
||||||
|
|
||||||
|
case CMSE_MPU_READ:
|
||||||
|
return __permb.flags.read_ok ? __pb : NULL;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if __ARM_CMSE_SECURE_MODE
|
||||||
|
static int __attribute__((__always_inline__, __nodebug__))
|
||||||
|
cmse_nonsecure_caller(void) {
|
||||||
|
return !((uintptr_t)__builtin_return_address(0) & 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define cmse_nsfptr_create(p) \
|
||||||
|
__builtin_bit_cast(__typeof__(p), \
|
||||||
|
(__builtin_bit_cast(uintptr_t, p) & ~(uintptr_t)1))
|
||||||
|
|
||||||
|
#define cmse_is_nsfptr(p) ((__builtin_bit_cast(uintptr_t, p) & 1) == 0)
|
||||||
|
|
||||||
|
#endif /* __ARM_CMSE_SECURE_MODE */
|
||||||
|
|
||||||
|
void __attribute__((__noreturn__)) cmse_abort(void);
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* (__ARM_FEATURE_CMSE & 0x1) */
|
||||||
|
|
||||||
|
#endif /* __ARM_CMSE_H */
|
596
third_party/aarch64/clang/arm_fp16.h
vendored
Normal file
596
third_party/aarch64/clang/arm_fp16.h
vendored
Normal file
|
@ -0,0 +1,596 @@
|
||||||
|
/*===---- arm_fp16.h - ARM FP16 intrinsics ---------------------------------===
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
* of this software and associated documentation files (the "Software"), to deal
|
||||||
|
* in the Software without restriction, including without limitation the rights
|
||||||
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the Software is
|
||||||
|
* furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
* THE SOFTWARE.
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __ARM_FP16_H
|
||||||
|
#define __ARM_FP16_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
typedef __fp16 float16_t;
|
||||||
|
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
|
||||||
|
|
||||||
|
#if defined(__aarch64__) || defined(__arm64ec__)
|
||||||
|
#define vabdh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vabdh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vabsh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vabsh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vaddh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vaddh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcageh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcageh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcagth_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcagth_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcaleh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcaleh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcalth_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcalth_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vceqh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vceqh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vceqzh_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vceqzh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcgeh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcgeh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcgezh_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcgezh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcgth_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcgth_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcgtzh_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcgtzh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcleh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcleh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vclezh_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vclezh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vclth_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vclth_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcltzh_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcltzh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_s16_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
int16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int16_t) __builtin_neon_vcvth_n_s16_f16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_s32_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
int32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int32_t) __builtin_neon_vcvth_n_s32_f16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_s64_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
int64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int64_t) __builtin_neon_vcvth_n_s64_f16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_u16_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcvth_n_u16_f16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_u32_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint32_t) __builtin_neon_vcvth_n_u32_f16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_u64_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint64_t) __builtin_neon_vcvth_n_u64_f16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_s16_f16(__p0) __extension__ ({ \
|
||||||
|
int16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int16_t) __builtin_neon_vcvth_s16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_s32_f16(__p0) __extension__ ({ \
|
||||||
|
int32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int32_t) __builtin_neon_vcvth_s32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_s64_f16(__p0) __extension__ ({ \
|
||||||
|
int64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int64_t) __builtin_neon_vcvth_s64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_u16_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcvth_u16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_u32_f16(__p0) __extension__ ({ \
|
||||||
|
uint32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint32_t) __builtin_neon_vcvth_u32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_u64_f16(__p0) __extension__ ({ \
|
||||||
|
uint64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint64_t) __builtin_neon_vcvth_u64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtah_s16_f16(__p0) __extension__ ({ \
|
||||||
|
int16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int16_t) __builtin_neon_vcvtah_s16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtah_s32_f16(__p0) __extension__ ({ \
|
||||||
|
int32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int32_t) __builtin_neon_vcvtah_s32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtah_s64_f16(__p0) __extension__ ({ \
|
||||||
|
int64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int64_t) __builtin_neon_vcvtah_s64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtah_u16_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcvtah_u16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtah_u32_f16(__p0) __extension__ ({ \
|
||||||
|
uint32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint32_t) __builtin_neon_vcvtah_u32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtah_u64_f16(__p0) __extension__ ({ \
|
||||||
|
uint64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint64_t) __builtin_neon_vcvtah_u64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_f16_u16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
uint16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_f16_u16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_f16_s16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
int16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_f16_s16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_f16_u32(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
uint32_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_f16_u32(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_f16_s32(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
int32_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_f16_s32(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_f16_u64(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
uint64_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_f16_u64(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_f16_s64(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
int64_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_f16_s64(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_f16_u32(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
uint32_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_n_f16_u32(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_f16_s32(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
int32_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_n_f16_s32(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_f16_u64(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
uint64_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_n_f16_u64(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_f16_s64(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
int64_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_n_f16_s64(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_f16_u16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
uint16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_n_f16_u16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_f16_s16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
int16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_n_f16_s16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtmh_s16_f16(__p0) __extension__ ({ \
|
||||||
|
int16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int16_t) __builtin_neon_vcvtmh_s16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtmh_s32_f16(__p0) __extension__ ({ \
|
||||||
|
int32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int32_t) __builtin_neon_vcvtmh_s32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtmh_s64_f16(__p0) __extension__ ({ \
|
||||||
|
int64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int64_t) __builtin_neon_vcvtmh_s64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtmh_u16_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcvtmh_u16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtmh_u32_f16(__p0) __extension__ ({ \
|
||||||
|
uint32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint32_t) __builtin_neon_vcvtmh_u32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtmh_u64_f16(__p0) __extension__ ({ \
|
||||||
|
uint64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint64_t) __builtin_neon_vcvtmh_u64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtnh_s16_f16(__p0) __extension__ ({ \
|
||||||
|
int16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int16_t) __builtin_neon_vcvtnh_s16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtnh_s32_f16(__p0) __extension__ ({ \
|
||||||
|
int32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int32_t) __builtin_neon_vcvtnh_s32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtnh_s64_f16(__p0) __extension__ ({ \
|
||||||
|
int64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int64_t) __builtin_neon_vcvtnh_s64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtnh_u16_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcvtnh_u16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtnh_u32_f16(__p0) __extension__ ({ \
|
||||||
|
uint32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint32_t) __builtin_neon_vcvtnh_u32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtnh_u64_f16(__p0) __extension__ ({ \
|
||||||
|
uint64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint64_t) __builtin_neon_vcvtnh_u64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtph_s16_f16(__p0) __extension__ ({ \
|
||||||
|
int16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int16_t) __builtin_neon_vcvtph_s16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtph_s32_f16(__p0) __extension__ ({ \
|
||||||
|
int32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int32_t) __builtin_neon_vcvtph_s32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtph_s64_f16(__p0) __extension__ ({ \
|
||||||
|
int64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int64_t) __builtin_neon_vcvtph_s64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtph_u16_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcvtph_u16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtph_u32_f16(__p0) __extension__ ({ \
|
||||||
|
uint32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint32_t) __builtin_neon_vcvtph_u32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtph_u64_f16(__p0) __extension__ ({ \
|
||||||
|
uint64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint64_t) __builtin_neon_vcvtph_u64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vdivh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vdivh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vfmah_f16(__p0, __p1, __p2) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
float16_t __s2 = __p2; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vfmah_f16(__s0, __s1, __s2); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vfmsh_f16(__p0, __p1, __p2) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
float16_t __s2 = __p2; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vfmsh_f16(__s0, __s1, __s2); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vmaxh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vmaxh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vmaxnmh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vmaxnmh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vminh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vminh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vminnmh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vminnmh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vmulh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vmulh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vmulxh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vmulxh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vnegh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vnegh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrecpeh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrecpeh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrecpsh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrecpsh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrecpxh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrecpxh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrndh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrndh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrndah_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrndah_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrndih_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrndih_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrndmh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrndmh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrndnh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrndnh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrndph_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrndph_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrndxh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrndxh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrsqrteh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrsqrteh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrsqrtsh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrsqrtsh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vsqrth_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vsqrth_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vsubh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vsubh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#undef __ai
|
||||||
|
|
||||||
|
#endif /* __ARM_FP16_H */
|
19187
third_party/aarch64/clang/arm_mve.h
vendored
Normal file
19187
third_party/aarch64/clang/arm_mve.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
69638
third_party/aarch64/clang/arm_neon.h
vendored
Normal file
69638
third_party/aarch64/clang/arm_neon.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
182
third_party/aarch64/clang/arm_neon_sve_bridge.h
vendored
Normal file
182
third_party/aarch64/clang/arm_neon_sve_bridge.h
vendored
Normal file
|
@ -0,0 +1,182 @@
|
||||||
|
/*===---- arm_neon_sve_bridge.h - ARM NEON SVE Bridge intrinsics -----------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __ARM_NEON_SVE_BRIDGE_H
|
||||||
|
#define __ARM_NEON_SVE_BRIDGE_H
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
#include <arm_sve.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Function attributes */
|
||||||
|
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
|
||||||
|
#define __aio \
|
||||||
|
static __inline__ \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __overloadable__))
|
||||||
|
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
|
||||||
|
svint8_t svset_neonq(svint8_t, int8x16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
|
||||||
|
svint16_t svset_neonq(svint16_t, int16x8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
|
||||||
|
svint32_t svset_neonq(svint32_t, int32x4_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
|
||||||
|
svint64_t svset_neonq(svint64_t, int64x2_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
|
||||||
|
svuint8_t svset_neonq(svuint8_t, uint8x16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
|
||||||
|
svuint16_t svset_neonq(svuint16_t, uint16x8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
|
||||||
|
svuint32_t svset_neonq(svuint32_t, uint32x4_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
|
||||||
|
svuint64_t svset_neonq(svuint64_t, uint64x2_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
|
||||||
|
svfloat16_t svset_neonq(svfloat16_t, float16x8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
|
||||||
|
svfloat32_t svset_neonq(svfloat32_t, float32x4_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
|
||||||
|
svfloat64_t svset_neonq(svfloat64_t, float64x2_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
|
||||||
|
svint8_t svset_neonq_s8(svint8_t, int8x16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
|
||||||
|
svint16_t svset_neonq_s16(svint16_t, int16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
|
||||||
|
svint32_t svset_neonq_s32(svint32_t, int32x4_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
|
||||||
|
svint64_t svset_neonq_s64(svint64_t, int64x2_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
|
||||||
|
svuint8_t svset_neonq_u8(svuint8_t, uint8x16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
|
||||||
|
svuint16_t svset_neonq_u16(svuint16_t, uint16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
|
||||||
|
svuint32_t svset_neonq_u32(svuint32_t, uint32x4_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
|
||||||
|
svuint64_t svset_neonq_u64(svuint64_t, uint64x2_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
|
||||||
|
svfloat16_t svset_neonq_f16(svfloat16_t, float16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
|
||||||
|
svfloat32_t svset_neonq_f32(svfloat32_t, float32x4_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
|
||||||
|
svfloat64_t svset_neonq_f64(svfloat64_t, float64x2_t);
|
||||||
|
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
|
||||||
|
int8x16_t svget_neonq(svint8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
|
||||||
|
int16x8_t svget_neonq(svint16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
|
||||||
|
int32x4_t svget_neonq(svint32_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
|
||||||
|
int64x2_t svget_neonq(svint64_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
|
||||||
|
uint8x16_t svget_neonq(svuint8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
|
||||||
|
uint16x8_t svget_neonq(svuint16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
|
||||||
|
uint32x4_t svget_neonq(svuint32_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
|
||||||
|
uint64x2_t svget_neonq(svuint64_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
|
||||||
|
float16x8_t svget_neonq(svfloat16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
|
||||||
|
float32x4_t svget_neonq(svfloat32_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
|
||||||
|
float64x2_t svget_neonq(svfloat64_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
|
||||||
|
int8x16_t svget_neonq_s8(svint8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
|
||||||
|
int16x8_t svget_neonq_s16(svint16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
|
||||||
|
int32x4_t svget_neonq_s32(svint32_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
|
||||||
|
int64x2_t svget_neonq_s64(svint64_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
|
||||||
|
uint8x16_t svget_neonq_u8(svuint8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
|
||||||
|
uint16x8_t svget_neonq_u16(svuint16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
|
||||||
|
uint32x4_t svget_neonq_u32(svuint32_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
|
||||||
|
uint64x2_t svget_neonq_u64(svuint64_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
|
||||||
|
float16x8_t svget_neonq_f16(svfloat16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
|
||||||
|
float32x4_t svget_neonq_f32(svfloat32_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
|
||||||
|
float64x2_t svget_neonq_f64(svfloat64_t);
|
||||||
|
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
|
||||||
|
svint8_t svdup_neonq(int8x16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
|
||||||
|
svint16_t svdup_neonq(int16x8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
|
||||||
|
svint32_t svdup_neonq(int32x4_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
|
||||||
|
svint64_t svdup_neonq(int64x2_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
|
||||||
|
svuint8_t svdup_neonq(uint8x16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
|
||||||
|
svuint16_t svdup_neonq(uint16x8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
|
||||||
|
svuint32_t svdup_neonq(uint32x4_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
|
||||||
|
svuint64_t svdup_neonq(uint64x2_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
|
||||||
|
svfloat16_t svdup_neonq(float16x8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
|
||||||
|
svfloat32_t svdup_neonq(float32x4_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
|
||||||
|
svfloat64_t svdup_neonq(float64x2_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
|
||||||
|
svint8_t svdup_neonq_s8(int8x16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
|
||||||
|
svint16_t svdup_neonq_s16(int16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
|
||||||
|
svint32_t svdup_neonq_s32(int32x4_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
|
||||||
|
svint64_t svdup_neonq_s64(int64x2_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
|
||||||
|
svuint8_t svdup_neonq_u8(uint8x16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
|
||||||
|
svuint16_t svdup_neonq_u16(uint16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
|
||||||
|
svuint32_t svdup_neonq_u32(uint32x4_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
|
||||||
|
svuint64_t svdup_neonq_u64(uint64x2_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
|
||||||
|
svfloat16_t svdup_neonq_f16(float16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
|
||||||
|
svfloat32_t svdup_neonq_f32(float32x4_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
|
||||||
|
svfloat64_t svdup_neonq_f64(float64x2_t);
|
||||||
|
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
|
||||||
|
svbfloat16_t svset_neonq(svbfloat16_t, bfloat16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
|
||||||
|
svbfloat16_t svset_neonq_bf16(svbfloat16_t, bfloat16x8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
|
||||||
|
bfloat16x8_t svget_neonq(svbfloat16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
|
||||||
|
bfloat16x8_t svget_neonq_bf16(svbfloat16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
|
||||||
|
svbfloat16_t svdup_neonq(bfloat16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
|
||||||
|
svbfloat16_t svdup_neonq_bf16(bfloat16x8_t);
|
||||||
|
|
||||||
|
#undef __ai
|
||||||
|
#undef __aio
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
} // extern "C"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif //__ARM_NEON_SVE_BRIDGE_H
|
2819
third_party/aarch64/clang/arm_sme.h
vendored
Normal file
2819
third_party/aarch64/clang/arm_sme.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
30537
third_party/aarch64/clang/arm_sve.h
vendored
Normal file
30537
third_party/aarch64/clang/arm_sve.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
345
third_party/aarch64/clang/arm_vector_types.h
vendored
Normal file
345
third_party/aarch64/clang/arm_vector_types.h
vendored
Normal file
|
@ -0,0 +1,345 @@
|
||||||
|
/*===---- arm_vector_types - ARM vector type ------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if !defined(__ARM_NEON_H) && !defined(__ARM_SVE_H)
|
||||||
|
#error "This file should not be used standalone. Please include arm_neon.h or arm_sve.h instead"
|
||||||
|
|
||||||
|
#endif
|
||||||
|
#ifndef __ARM_NEON_TYPES_H
|
||||||
|
#define __ARM_NEON_TYPES_H
|
||||||
|
typedef float float32_t;
|
||||||
|
typedef __fp16 float16_t;
|
||||||
|
#if defined(__aarch64__) || defined(__arm64ec__)
|
||||||
|
typedef double float64_t;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
|
||||||
|
typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
|
||||||
|
typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t;
|
||||||
|
typedef __attribute__((neon_vector_type(8))) int16_t int16x8_t;
|
||||||
|
typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
|
||||||
|
typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
|
||||||
|
typedef __attribute__((neon_vector_type(1))) int64_t int64x1_t;
|
||||||
|
typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
|
||||||
|
typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
|
||||||
|
typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
|
||||||
|
typedef __attribute__((neon_vector_type(4))) uint16_t uint16x4_t;
|
||||||
|
typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
|
||||||
|
typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
|
||||||
|
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
|
||||||
|
typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
|
||||||
|
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
|
||||||
|
typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
|
||||||
|
typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
|
||||||
|
typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
|
||||||
|
typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
|
||||||
|
#if defined(__aarch64__) || defined(__arm64ec__)
|
||||||
|
typedef __attribute__((neon_vector_type(1))) float64_t float64x1_t;
|
||||||
|
typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef struct int8x8x2_t {
|
||||||
|
int8x8_t val[2];
|
||||||
|
} int8x8x2_t;
|
||||||
|
|
||||||
|
typedef struct int8x16x2_t {
|
||||||
|
int8x16_t val[2];
|
||||||
|
} int8x16x2_t;
|
||||||
|
|
||||||
|
typedef struct int16x4x2_t {
|
||||||
|
int16x4_t val[2];
|
||||||
|
} int16x4x2_t;
|
||||||
|
|
||||||
|
typedef struct int16x8x2_t {
|
||||||
|
int16x8_t val[2];
|
||||||
|
} int16x8x2_t;
|
||||||
|
|
||||||
|
typedef struct int32x2x2_t {
|
||||||
|
int32x2_t val[2];
|
||||||
|
} int32x2x2_t;
|
||||||
|
|
||||||
|
typedef struct int32x4x2_t {
|
||||||
|
int32x4_t val[2];
|
||||||
|
} int32x4x2_t;
|
||||||
|
|
||||||
|
typedef struct int64x1x2_t {
|
||||||
|
int64x1_t val[2];
|
||||||
|
} int64x1x2_t;
|
||||||
|
|
||||||
|
typedef struct int64x2x2_t {
|
||||||
|
int64x2_t val[2];
|
||||||
|
} int64x2x2_t;
|
||||||
|
|
||||||
|
typedef struct uint8x8x2_t {
|
||||||
|
uint8x8_t val[2];
|
||||||
|
} uint8x8x2_t;
|
||||||
|
|
||||||
|
typedef struct uint8x16x2_t {
|
||||||
|
uint8x16_t val[2];
|
||||||
|
} uint8x16x2_t;
|
||||||
|
|
||||||
|
typedef struct uint16x4x2_t {
|
||||||
|
uint16x4_t val[2];
|
||||||
|
} uint16x4x2_t;
|
||||||
|
|
||||||
|
typedef struct uint16x8x2_t {
|
||||||
|
uint16x8_t val[2];
|
||||||
|
} uint16x8x2_t;
|
||||||
|
|
||||||
|
typedef struct uint32x2x2_t {
|
||||||
|
uint32x2_t val[2];
|
||||||
|
} uint32x2x2_t;
|
||||||
|
|
||||||
|
typedef struct uint32x4x2_t {
|
||||||
|
uint32x4_t val[2];
|
||||||
|
} uint32x4x2_t;
|
||||||
|
|
||||||
|
typedef struct uint64x1x2_t {
|
||||||
|
uint64x1_t val[2];
|
||||||
|
} uint64x1x2_t;
|
||||||
|
|
||||||
|
typedef struct uint64x2x2_t {
|
||||||
|
uint64x2_t val[2];
|
||||||
|
} uint64x2x2_t;
|
||||||
|
|
||||||
|
typedef struct float16x4x2_t {
|
||||||
|
float16x4_t val[2];
|
||||||
|
} float16x4x2_t;
|
||||||
|
|
||||||
|
typedef struct float16x8x2_t {
|
||||||
|
float16x8_t val[2];
|
||||||
|
} float16x8x2_t;
|
||||||
|
|
||||||
|
typedef struct float32x2x2_t {
|
||||||
|
float32x2_t val[2];
|
||||||
|
} float32x2x2_t;
|
||||||
|
|
||||||
|
typedef struct float32x4x2_t {
|
||||||
|
float32x4_t val[2];
|
||||||
|
} float32x4x2_t;
|
||||||
|
|
||||||
|
#if defined(__aarch64__) || defined(__arm64ec__)
|
||||||
|
typedef struct float64x1x2_t {
|
||||||
|
float64x1_t val[2];
|
||||||
|
} float64x1x2_t;
|
||||||
|
|
||||||
|
typedef struct float64x2x2_t {
|
||||||
|
float64x2_t val[2];
|
||||||
|
} float64x2x2_t;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
typedef struct int8x8x3_t {
|
||||||
|
int8x8_t val[3];
|
||||||
|
} int8x8x3_t;
|
||||||
|
|
||||||
|
typedef struct int8x16x3_t {
|
||||||
|
int8x16_t val[3];
|
||||||
|
} int8x16x3_t;
|
||||||
|
|
||||||
|
typedef struct int16x4x3_t {
|
||||||
|
int16x4_t val[3];
|
||||||
|
} int16x4x3_t;
|
||||||
|
|
||||||
|
typedef struct int16x8x3_t {
|
||||||
|
int16x8_t val[3];
|
||||||
|
} int16x8x3_t;
|
||||||
|
|
||||||
|
typedef struct int32x2x3_t {
|
||||||
|
int32x2_t val[3];
|
||||||
|
} int32x2x3_t;
|
||||||
|
|
||||||
|
typedef struct int32x4x3_t {
|
||||||
|
int32x4_t val[3];
|
||||||
|
} int32x4x3_t;
|
||||||
|
|
||||||
|
typedef struct int64x1x3_t {
|
||||||
|
int64x1_t val[3];
|
||||||
|
} int64x1x3_t;
|
||||||
|
|
||||||
|
typedef struct int64x2x3_t {
|
||||||
|
int64x2_t val[3];
|
||||||
|
} int64x2x3_t;
|
||||||
|
|
||||||
|
typedef struct uint8x8x3_t {
|
||||||
|
uint8x8_t val[3];
|
||||||
|
} uint8x8x3_t;
|
||||||
|
|
||||||
|
typedef struct uint8x16x3_t {
|
||||||
|
uint8x16_t val[3];
|
||||||
|
} uint8x16x3_t;
|
||||||
|
|
||||||
|
typedef struct uint16x4x3_t {
|
||||||
|
uint16x4_t val[3];
|
||||||
|
} uint16x4x3_t;
|
||||||
|
|
||||||
|
typedef struct uint16x8x3_t {
|
||||||
|
uint16x8_t val[3];
|
||||||
|
} uint16x8x3_t;
|
||||||
|
|
||||||
|
typedef struct uint32x2x3_t {
|
||||||
|
uint32x2_t val[3];
|
||||||
|
} uint32x2x3_t;
|
||||||
|
|
||||||
|
typedef struct uint32x4x3_t {
|
||||||
|
uint32x4_t val[3];
|
||||||
|
} uint32x4x3_t;
|
||||||
|
|
||||||
|
typedef struct uint64x1x3_t {
|
||||||
|
uint64x1_t val[3];
|
||||||
|
} uint64x1x3_t;
|
||||||
|
|
||||||
|
typedef struct uint64x2x3_t {
|
||||||
|
uint64x2_t val[3];
|
||||||
|
} uint64x2x3_t;
|
||||||
|
|
||||||
|
typedef struct float16x4x3_t {
|
||||||
|
float16x4_t val[3];
|
||||||
|
} float16x4x3_t;
|
||||||
|
|
||||||
|
typedef struct float16x8x3_t {
|
||||||
|
float16x8_t val[3];
|
||||||
|
} float16x8x3_t;
|
||||||
|
|
||||||
|
typedef struct float32x2x3_t {
|
||||||
|
float32x2_t val[3];
|
||||||
|
} float32x2x3_t;
|
||||||
|
|
||||||
|
typedef struct float32x4x3_t {
|
||||||
|
float32x4_t val[3];
|
||||||
|
} float32x4x3_t;
|
||||||
|
|
||||||
|
#if defined(__aarch64__) || defined(__arm64ec__)
|
||||||
|
typedef struct float64x1x3_t {
|
||||||
|
float64x1_t val[3];
|
||||||
|
} float64x1x3_t;
|
||||||
|
|
||||||
|
typedef struct float64x2x3_t {
|
||||||
|
float64x2_t val[3];
|
||||||
|
} float64x2x3_t;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
typedef struct int8x8x4_t {
|
||||||
|
int8x8_t val[4];
|
||||||
|
} int8x8x4_t;
|
||||||
|
|
||||||
|
typedef struct int8x16x4_t {
|
||||||
|
int8x16_t val[4];
|
||||||
|
} int8x16x4_t;
|
||||||
|
|
||||||
|
typedef struct int16x4x4_t {
|
||||||
|
int16x4_t val[4];
|
||||||
|
} int16x4x4_t;
|
||||||
|
|
||||||
|
typedef struct int16x8x4_t {
|
||||||
|
int16x8_t val[4];
|
||||||
|
} int16x8x4_t;
|
||||||
|
|
||||||
|
typedef struct int32x2x4_t {
|
||||||
|
int32x2_t val[4];
|
||||||
|
} int32x2x4_t;
|
||||||
|
|
||||||
|
typedef struct int32x4x4_t {
|
||||||
|
int32x4_t val[4];
|
||||||
|
} int32x4x4_t;
|
||||||
|
|
||||||
|
typedef struct int64x1x4_t {
|
||||||
|
int64x1_t val[4];
|
||||||
|
} int64x1x4_t;
|
||||||
|
|
||||||
|
typedef struct int64x2x4_t {
|
||||||
|
int64x2_t val[4];
|
||||||
|
} int64x2x4_t;
|
||||||
|
|
||||||
|
typedef struct uint8x8x4_t {
|
||||||
|
uint8x8_t val[4];
|
||||||
|
} uint8x8x4_t;
|
||||||
|
|
||||||
|
typedef struct uint8x16x4_t {
|
||||||
|
uint8x16_t val[4];
|
||||||
|
} uint8x16x4_t;
|
||||||
|
|
||||||
|
typedef struct uint16x4x4_t {
|
||||||
|
uint16x4_t val[4];
|
||||||
|
} uint16x4x4_t;
|
||||||
|
|
||||||
|
typedef struct uint16x8x4_t {
|
||||||
|
uint16x8_t val[4];
|
||||||
|
} uint16x8x4_t;
|
||||||
|
|
||||||
|
typedef struct uint32x2x4_t {
|
||||||
|
uint32x2_t val[4];
|
||||||
|
} uint32x2x4_t;
|
||||||
|
|
||||||
|
typedef struct uint32x4x4_t {
|
||||||
|
uint32x4_t val[4];
|
||||||
|
} uint32x4x4_t;
|
||||||
|
|
||||||
|
typedef struct uint64x1x4_t {
|
||||||
|
uint64x1_t val[4];
|
||||||
|
} uint64x1x4_t;
|
||||||
|
|
||||||
|
typedef struct uint64x2x4_t {
|
||||||
|
uint64x2_t val[4];
|
||||||
|
} uint64x2x4_t;
|
||||||
|
|
||||||
|
typedef struct float16x4x4_t {
|
||||||
|
float16x4_t val[4];
|
||||||
|
} float16x4x4_t;
|
||||||
|
|
||||||
|
typedef struct float16x8x4_t {
|
||||||
|
float16x8_t val[4];
|
||||||
|
} float16x8x4_t;
|
||||||
|
|
||||||
|
typedef struct float32x2x4_t {
|
||||||
|
float32x2_t val[4];
|
||||||
|
} float32x2x4_t;
|
||||||
|
|
||||||
|
typedef struct float32x4x4_t {
|
||||||
|
float32x4_t val[4];
|
||||||
|
} float32x4x4_t;
|
||||||
|
|
||||||
|
#if defined(__aarch64__) || defined(__arm64ec__)
|
||||||
|
typedef struct float64x1x4_t {
|
||||||
|
float64x1_t val[4];
|
||||||
|
} float64x1x4_t;
|
||||||
|
|
||||||
|
typedef struct float64x2x4_t {
|
||||||
|
float64x2_t val[4];
|
||||||
|
} float64x2x4_t;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
typedef __attribute__((neon_vector_type(4))) bfloat16_t bfloat16x4_t;
|
||||||
|
typedef __attribute__((neon_vector_type(8))) bfloat16_t bfloat16x8_t;
|
||||||
|
|
||||||
|
typedef struct bfloat16x4x2_t {
|
||||||
|
bfloat16x4_t val[2];
|
||||||
|
} bfloat16x4x2_t;
|
||||||
|
|
||||||
|
typedef struct bfloat16x8x2_t {
|
||||||
|
bfloat16x8_t val[2];
|
||||||
|
} bfloat16x8x2_t;
|
||||||
|
|
||||||
|
typedef struct bfloat16x4x3_t {
|
||||||
|
bfloat16x4_t val[3];
|
||||||
|
} bfloat16x4x3_t;
|
||||||
|
|
||||||
|
typedef struct bfloat16x8x3_t {
|
||||||
|
bfloat16x8_t val[3];
|
||||||
|
} bfloat16x8x3_t;
|
||||||
|
|
||||||
|
typedef struct bfloat16x4x4_t {
|
||||||
|
bfloat16x4_t val[4];
|
||||||
|
} bfloat16x4x4_t;
|
||||||
|
|
||||||
|
typedef struct bfloat16x8x4_t {
|
||||||
|
bfloat16x8_t val[4];
|
||||||
|
} bfloat16x8x4_t;
|
||||||
|
|
||||||
|
#endif // __ARM_NEON_TYPES_H
|
31
third_party/aarch64/clang/armintr.h
vendored
Normal file
31
third_party/aarch64/clang/armintr.h
vendored
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Only include this if we're compiling for the windows platform. */
|
||||||
|
#ifndef _MSC_VER
|
||||||
|
#include_next <armintr.h>
|
||||||
|
#else
|
||||||
|
|
||||||
|
#ifndef __ARMINTR_H
|
||||||
|
#define __ARMINTR_H
|
||||||
|
|
||||||
|
typedef enum
|
||||||
|
{
|
||||||
|
_ARM_BARRIER_SY = 0xF,
|
||||||
|
_ARM_BARRIER_ST = 0xE,
|
||||||
|
_ARM_BARRIER_ISH = 0xB,
|
||||||
|
_ARM_BARRIER_ISHST = 0xA,
|
||||||
|
_ARM_BARRIER_NSH = 0x7,
|
||||||
|
_ARM_BARRIER_NSHST = 0x6,
|
||||||
|
_ARM_BARRIER_OSH = 0x3,
|
||||||
|
_ARM_BARRIER_OSHST = 0x2
|
||||||
|
} _ARMINTR_BARRIER_TYPE;
|
||||||
|
|
||||||
|
#endif /* __ARMINTR_H */
|
||||||
|
#endif /* _MSC_VER */
|
2
third_party/awk/run.c
vendored
2
third_party/awk/run.c
vendored
|
@ -495,7 +495,7 @@ makearraystring(Node *p, const char *func)
|
||||||
|
|
||||||
if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
|
if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
|
||||||
FATAL("%s: out of memory %s[%s...]",
|
FATAL("%s: out of memory %s[%s...]",
|
||||||
func, x->nval, buf);
|
func ? func : "NULL", x->nval, buf);
|
||||||
}
|
}
|
||||||
memcpy(buf + blen, s, slen);
|
memcpy(buf + blen, s, slen);
|
||||||
if (nsub) {
|
if (nsub) {
|
||||||
|
|
3
third_party/double-conversion/BUILD.mk
vendored
3
third_party/double-conversion/BUILD.mk
vendored
|
@ -34,7 +34,8 @@ THIRD_PARTY_DOUBLECONVERSION_A_DIRECTDEPS = \
|
||||||
LIBC_MEM \
|
LIBC_MEM \
|
||||||
LIBC_STR \
|
LIBC_STR \
|
||||||
LIBC_TINYMATH \
|
LIBC_TINYMATH \
|
||||||
THIRD_PARTY_LIBCXXABI
|
THIRD_PARTY_LIBCXXABI \
|
||||||
|
THIRD_PARTY_LIBUNWIND
|
||||||
|
|
||||||
THIRD_PARTY_DOUBLECONVERSION_A_DEPS := \
|
THIRD_PARTY_DOUBLECONVERSION_A_DEPS := \
|
||||||
$(call uniq,$(foreach x,$(THIRD_PARTY_DOUBLECONVERSION_A_DIRECTDEPS),$($(x))))
|
$(call uniq,$(foreach x,$(THIRD_PARTY_DOUBLECONVERSION_A_DIRECTDEPS),$($(x))))
|
||||||
|
|
2
third_party/intel/BUILD.mk
vendored
2
third_party/intel/BUILD.mk
vendored
|
@ -3,4 +3,4 @@
|
||||||
|
|
||||||
PKGS += THIRD_PARTY_INTEL
|
PKGS += THIRD_PARTY_INTEL
|
||||||
THIRD_PARTY_INTEL_HDRS = $(filter %.h,$(THIRD_PARTY_INTEL_FILES))
|
THIRD_PARTY_INTEL_HDRS = $(filter %.h,$(THIRD_PARTY_INTEL_FILES))
|
||||||
THIRD_PARTY_INTEL_FILES := $(wildcard third_party/intel/*)
|
THIRD_PARTY_INTEL_FILES := $(wildcard third_party/intel/*) $(wildcard third_party/intel/clang/*)
|
||||||
|
|
140
third_party/intel/clang/__wmmintrin_aes.h
vendored
Normal file
140
third_party/intel/clang/__wmmintrin_aes.h
vendored
Normal file
|
@ -0,0 +1,140 @@
|
||||||
|
/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __WMMINTRIN_H
|
||||||
|
#error "Never use <__wmmintrin_aes.h> directly; include <wmmintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __WMMINTRIN_AES_H
|
||||||
|
#define __WMMINTRIN_AES_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128)))
|
||||||
|
|
||||||
|
/// Performs a single round of AES encryption using the Equivalent
|
||||||
|
/// Inverse Cipher, transforming the state value from the first source
|
||||||
|
/// operand using a 128-bit round key value contained in the second source
|
||||||
|
/// operand, and writes the result to the destination.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __V
|
||||||
|
/// A 128-bit integer vector containing the state value.
|
||||||
|
/// \param __R
|
||||||
|
/// A 128-bit integer vector containing the round key value.
|
||||||
|
/// \returns A 128-bit integer vector containing the encrypted value.
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
|
_mm_aesenc_si128(__m128i __V, __m128i __R)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Performs the final round of AES encryption using the Equivalent
|
||||||
|
/// Inverse Cipher, transforming the state value from the first source
|
||||||
|
/// operand using a 128-bit round key value contained in the second source
|
||||||
|
/// operand, and writes the result to the destination.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __V
|
||||||
|
/// A 128-bit integer vector containing the state value.
|
||||||
|
/// \param __R
|
||||||
|
/// A 128-bit integer vector containing the round key value.
|
||||||
|
/// \returns A 128-bit integer vector containing the encrypted value.
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
|
_mm_aesenclast_si128(__m128i __V, __m128i __R)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Performs a single round of AES decryption using the Equivalent
|
||||||
|
/// Inverse Cipher, transforming the state value from the first source
|
||||||
|
/// operand using a 128-bit round key value contained in the second source
|
||||||
|
/// operand, and writes the result to the destination.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __V
|
||||||
|
/// A 128-bit integer vector containing the state value.
|
||||||
|
/// \param __R
|
||||||
|
/// A 128-bit integer vector containing the round key value.
|
||||||
|
/// \returns A 128-bit integer vector containing the decrypted value.
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
|
_mm_aesdec_si128(__m128i __V, __m128i __R)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Performs the final round of AES decryption using the Equivalent
|
||||||
|
/// Inverse Cipher, transforming the state value from the first source
|
||||||
|
/// operand using a 128-bit round key value contained in the second source
|
||||||
|
/// operand, and writes the result to the destination.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __V
|
||||||
|
/// A 128-bit integer vector containing the state value.
|
||||||
|
/// \param __R
|
||||||
|
/// A 128-bit integer vector containing the round key value.
|
||||||
|
/// \returns A 128-bit integer vector containing the decrypted value.
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
|
_mm_aesdeclast_si128(__m128i __V, __m128i __R)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Applies the AES InvMixColumns() transformation to an expanded key
|
||||||
|
/// contained in the source operand, and writes the result to the
|
||||||
|
/// destination.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __V
|
||||||
|
/// A 128-bit integer vector containing the expanded key.
|
||||||
|
/// \returns A 128-bit integer vector containing the transformed value.
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
|
_mm_aesimc_si128(__m128i __V)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generates a round key for AES encryption, operating on 128-bit data
|
||||||
|
/// specified in the first source operand and using an 8-bit round constant
|
||||||
|
/// specified by the second source operand, and writes the result to the
|
||||||
|
/// destination.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \code
|
||||||
|
/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param C
|
||||||
|
/// A 128-bit integer vector that is used to generate the AES encryption key.
|
||||||
|
/// \param R
|
||||||
|
/// An 8-bit round constant used to generate the AES encryption key.
|
||||||
|
/// \returns A 128-bit round key for AES encryption.
|
||||||
|
#define _mm_aeskeygenassist_si128(C, R) \
|
||||||
|
((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif /* __WMMINTRIN_AES_H */
|
48
third_party/intel/clang/__wmmintrin_pclmul.h
vendored
Normal file
48
third_party/intel/clang/__wmmintrin_pclmul.h
vendored
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
/*===---- __wmmintrin_pclmul.h - PCMUL intrinsics ---------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __WMMINTRIN_H
|
||||||
|
#error "Never use <__wmmintrin_pclmul.h> directly; include <wmmintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __WMMINTRIN_PCLMUL_H
|
||||||
|
#define __WMMINTRIN_PCLMUL_H
|
||||||
|
|
||||||
|
/// Multiplies two 64-bit integer values, which are selected from source
|
||||||
|
/// operands using the immediate-value operand. The multiplication is a
|
||||||
|
/// carry-less multiplication, and the 128-bit integer product is stored in
|
||||||
|
/// the destination.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \code
|
||||||
|
/// __m128i _mm_clmulepi64_si128(__m128i X, __m128i Y, const int I);
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param X
|
||||||
|
/// A 128-bit vector of [2 x i64] containing one of the source operands.
|
||||||
|
/// \param Y
|
||||||
|
/// A 128-bit vector of [2 x i64] containing one of the source operands.
|
||||||
|
/// \param I
|
||||||
|
/// An immediate value specifying which 64-bit values to select from the
|
||||||
|
/// operands. Bit 0 is used to select a value from operand \a X, and bit
|
||||||
|
/// 4 is used to select a value from operand \a Y: \n
|
||||||
|
/// Bit[0]=0 indicates that bits[63:0] of operand \a X are used. \n
|
||||||
|
/// Bit[0]=1 indicates that bits[127:64] of operand \a X are used. \n
|
||||||
|
/// Bit[4]=0 indicates that bits[63:0] of operand \a Y are used. \n
|
||||||
|
/// Bit[4]=1 indicates that bits[127:64] of operand \a Y are used.
|
||||||
|
/// \returns The 128-bit integer vector containing the result of the carry-less
|
||||||
|
/// multiplication of the selected 64-bit values.
|
||||||
|
#define _mm_clmulepi64_si128(X, Y, I) \
|
||||||
|
((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
|
||||||
|
(__v2di)(__m128i)(Y), (char)(I)))
|
||||||
|
|
||||||
|
#endif /* __WMMINTRIN_PCLMUL_H */
|
160
third_party/intel/clang/adcintrin.h
vendored
Normal file
160
third_party/intel/clang/adcintrin.h
vendored
Normal file
|
@ -0,0 +1,160 @@
|
||||||
|
/*===---- adcintrin.h - ADC intrinsics -------------------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __ADCINTRIN_H
|
||||||
|
#define __ADCINTRIN_H
|
||||||
|
|
||||||
|
#if !defined(__i386__) && !defined(__x86_64__)
|
||||||
|
#error "This header is only meant to be used on x86 and x64 architecture"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||||
|
|
||||||
|
/* Use C++ inline semantics in C++, GNU inline for C mode. */
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
#define __INLINE __inline
|
||||||
|
#else
|
||||||
|
#define __INLINE static __inline
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
|
||||||
|
/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
|
||||||
|
/// at \a __p, and returns the 8-bit carry-out (carry flag).
|
||||||
|
///
|
||||||
|
/// \code{.operation}
|
||||||
|
/// temp := (__cf == 0) ? 0 : 1
|
||||||
|
/// Store32(__p, __x + __y + temp)
|
||||||
|
/// result := CF
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the \c ADC instruction.
|
||||||
|
///
|
||||||
|
/// \param __cf
|
||||||
|
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
|
||||||
|
/// \param __x
|
||||||
|
/// A 32-bit unsigned addend.
|
||||||
|
/// \param __y
|
||||||
|
/// A 32-bit unsigned addend.
|
||||||
|
/// \param __p
|
||||||
|
/// Pointer to memory for storing the sum.
|
||||||
|
/// \returns The 8-bit unsigned carry-out value.
|
||||||
|
__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
|
||||||
|
unsigned int __x,
|
||||||
|
unsigned int __y,
|
||||||
|
unsigned int *__p) {
|
||||||
|
return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry
|
||||||
|
/// flag \a __cf, and subtracts the result from unsigned 32-bit integer
|
||||||
|
/// \a __x. Stores the unsigned 32-bit difference in the memory at \a __p,
|
||||||
|
/// and returns the 8-bit carry-out (carry or overflow flag).
|
||||||
|
///
|
||||||
|
/// \code{.operation}
|
||||||
|
/// temp := (__cf == 0) ? 0 : 1
|
||||||
|
/// Store32(__p, __x - (__y + temp))
|
||||||
|
/// result := CF
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the \c SBB instruction.
|
||||||
|
///
|
||||||
|
/// \param __cf
|
||||||
|
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
|
||||||
|
/// \param __x
|
||||||
|
/// The 32-bit unsigned minuend.
|
||||||
|
/// \param __y
|
||||||
|
/// The 32-bit unsigned subtrahend.
|
||||||
|
/// \param __p
|
||||||
|
/// Pointer to memory for storing the difference.
|
||||||
|
/// \returns The 8-bit unsigned carry-out value.
|
||||||
|
__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
|
||||||
|
unsigned int __x,
|
||||||
|
unsigned int __y,
|
||||||
|
unsigned int *__p) {
|
||||||
|
return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef __x86_64__
|
||||||
|
/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
|
||||||
|
/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
|
||||||
|
/// at \a __p, and returns the 8-bit carry-out (carry flag).
|
||||||
|
///
|
||||||
|
/// \code{.operation}
|
||||||
|
/// temp := (__cf == 0) ? 0 : 1
|
||||||
|
/// Store64(__p, __x + __y + temp)
|
||||||
|
/// result := CF
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the \c ADC instruction.
|
||||||
|
///
|
||||||
|
/// \param __cf
|
||||||
|
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
|
||||||
|
/// \param __x
|
||||||
|
/// A 64-bit unsigned addend.
|
||||||
|
/// \param __y
|
||||||
|
/// A 64-bit unsigned addend.
|
||||||
|
/// \param __p
|
||||||
|
/// Pointer to memory for storing the sum.
|
||||||
|
/// \returns The 8-bit unsigned carry-out value.
|
||||||
|
__INLINE unsigned char __DEFAULT_FN_ATTRS
|
||||||
|
_addcarry_u64(unsigned char __cf, unsigned long long __x,
|
||||||
|
unsigned long long __y, unsigned long long *__p) {
|
||||||
|
return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry
|
||||||
|
/// flag \a __cf, and subtracts the result from unsigned 64-bit integer
|
||||||
|
/// \a __x. Stores the unsigned 64-bit difference in the memory at \a __p,
|
||||||
|
/// and returns the 8-bit carry-out (carry or overflow flag).
|
||||||
|
///
|
||||||
|
/// \code{.operation}
|
||||||
|
/// temp := (__cf == 0) ? 0 : 1
|
||||||
|
/// Store64(__p, __x - (__y + temp))
|
||||||
|
/// result := CF
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the \c ADC instruction.
|
||||||
|
///
|
||||||
|
/// \param __cf
|
||||||
|
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
|
||||||
|
/// \param __x
|
||||||
|
/// The 64-bit unsigned minuend.
|
||||||
|
/// \param __y
|
||||||
|
/// The 64-bit unsigned subtrahend.
|
||||||
|
/// \param __p
|
||||||
|
/// Pointer to memory for storing the difference.
|
||||||
|
/// \returns The 8-bit unsigned carry-out value.
|
||||||
|
__INLINE unsigned char __DEFAULT_FN_ATTRS
|
||||||
|
_subborrow_u64(unsigned char __cf, unsigned long long __x,
|
||||||
|
unsigned long long __y, unsigned long long *__p) {
|
||||||
|
return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#undef __INLINE
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif /* __ADCINTRIN_H */
|
102
third_party/intel/clang/adxintrin.h
vendored
Normal file
102
third_party/intel/clang/adxintrin.h
vendored
Normal file
|
@ -0,0 +1,102 @@
|
||||||
|
/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __ADXINTRIN_H
|
||||||
|
#define __ADXINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __target__("adx")))
|
||||||
|
|
||||||
|
/* Use C++ inline semantics in C++, GNU inline for C mode. */
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
#define __INLINE __inline
|
||||||
|
#else
|
||||||
|
#define __INLINE static __inline
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Intrinsics that are available only if __ADX__ is defined. */
|
||||||
|
|
||||||
|
/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
|
||||||
|
/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
|
||||||
|
/// at \a __p, and returns the 8-bit carry-out (carry flag).
|
||||||
|
///
|
||||||
|
/// \code{.operation}
|
||||||
|
/// temp := (__cf == 0) ? 0 : 1
|
||||||
|
/// Store32(__p, __x + __y + temp)
|
||||||
|
/// result := CF
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the \c ADCX instruction.
|
||||||
|
///
|
||||||
|
/// \param __cf
|
||||||
|
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
|
||||||
|
/// \param __x
|
||||||
|
/// A 32-bit unsigned addend.
|
||||||
|
/// \param __y
|
||||||
|
/// A 32-bit unsigned addend.
|
||||||
|
/// \param __p
|
||||||
|
/// Pointer to memory for storing the sum.
|
||||||
|
/// \returns The 8-bit unsigned carry-out value.
|
||||||
|
__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarryx_u32(unsigned char __cf,
|
||||||
|
unsigned int __x,
|
||||||
|
unsigned int __y,
|
||||||
|
unsigned int *__p) {
|
||||||
|
return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef __x86_64__
|
||||||
|
/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
|
||||||
|
/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
|
||||||
|
/// at \a __p, and returns the 8-bit carry-out (carry flag).
|
||||||
|
///
|
||||||
|
/// \code{.operation}
|
||||||
|
/// temp := (__cf == 0) ? 0 : 1
|
||||||
|
/// Store64(__p, __x + __y + temp)
|
||||||
|
/// result := CF
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the \c ADCX instruction.
|
||||||
|
///
|
||||||
|
/// \param __cf
|
||||||
|
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
|
||||||
|
/// \param __x
|
||||||
|
/// A 64-bit unsigned addend.
|
||||||
|
/// \param __y
|
||||||
|
/// A 64-bit unsigned addend.
|
||||||
|
/// \param __p
|
||||||
|
/// Pointer to memory for storing the sum.
|
||||||
|
/// \returns The 8-bit unsigned carry-out value.
|
||||||
|
__INLINE unsigned char __DEFAULT_FN_ATTRS
|
||||||
|
_addcarryx_u64(unsigned char __cf, unsigned long long __x,
|
||||||
|
unsigned long long __y, unsigned long long *__p) {
|
||||||
|
return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#undef __INLINE
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif /* __ADXINTRIN_H */
|
183
third_party/intel/clang/ammintrin.h
vendored
Normal file
183
third_party/intel/clang/ammintrin.h
vendored
Normal file
|
@ -0,0 +1,183 @@
|
||||||
|
/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __AMMINTRIN_H
|
||||||
|
#define __AMMINTRIN_H
|
||||||
|
|
||||||
|
#if !defined(__i386__) && !defined(__x86_64__)
|
||||||
|
#error "This header is only meant to be used on x86 and x64 architecture"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "pmmintrin.h"
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128)))
|
||||||
|
|
||||||
|
/// Extracts the specified bits from the lower 64 bits of the 128-bit
|
||||||
|
/// integer vector operand at the index \a idx and of the length \a len.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \code
|
||||||
|
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param x
|
||||||
|
/// The value from which bits are extracted.
|
||||||
|
/// \param len
|
||||||
|
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
|
||||||
|
/// are zero, the length is interpreted as 64.
|
||||||
|
/// \param idx
|
||||||
|
/// Bits [5:0] specify the index of the least significant bit; the other
|
||||||
|
/// bits are ignored. If the sum of the index and length is greater than 64,
|
||||||
|
/// the result is undefined. If the length and index are both zero, bits
|
||||||
|
/// [63:0] of parameter \a x are extracted. If the length is zero but the
|
||||||
|
/// index is non-zero, the result is undefined.
|
||||||
|
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
|
||||||
|
/// extracted from the source operand.
|
||||||
|
#define _mm_extracti_si64(x, len, idx) \
|
||||||
|
((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
|
||||||
|
(char)(len), (char)(idx)))
|
||||||
|
|
||||||
|
/// Extracts the specified bits from the lower 64 bits of the 128-bit
|
||||||
|
/// integer vector operand at the index and of the length specified by
|
||||||
|
/// \a __y.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __x
|
||||||
|
/// The value from which bits are extracted.
|
||||||
|
/// \param __y
|
||||||
|
/// Specifies the index of the least significant bit at [13:8] and the
|
||||||
|
/// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
|
||||||
|
/// length is interpreted as 64. If the sum of the index and length is
|
||||||
|
/// greater than 64, the result is undefined. If the length and index are
|
||||||
|
/// both zero, bits [63:0] of parameter \a __x are extracted. If the length
|
||||||
|
/// is zero but the index is non-zero, the result is undefined.
|
||||||
|
/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
|
||||||
|
/// from the source operand.
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
|
_mm_extract_si64(__m128i __x, __m128i __y)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Inserts bits of a specified length from the source integer vector
|
||||||
|
/// \a y into the lower 64 bits of the destination integer vector \a x at
|
||||||
|
/// the index \a idx and of the length \a len.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \code
|
||||||
|
/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
|
||||||
|
/// const int idx);
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param x
|
||||||
|
/// The destination operand where bits will be inserted. The inserted bits
|
||||||
|
/// are defined by the length \a len and by the index \a idx specifying the
|
||||||
|
/// least significant bit.
|
||||||
|
/// \param y
|
||||||
|
/// The source operand containing the bits to be extracted. The extracted
|
||||||
|
/// bits are the least significant bits of operand \a y of length \a len.
|
||||||
|
/// \param len
|
||||||
|
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
|
||||||
|
/// are zero, the length is interpreted as 64.
|
||||||
|
/// \param idx
|
||||||
|
/// Bits [5:0] specify the index of the least significant bit; the other
|
||||||
|
/// bits are ignored. If the sum of the index and length is greater than 64,
|
||||||
|
/// the result is undefined. If the length and index are both zero, bits
|
||||||
|
/// [63:0] of parameter \a y are inserted into parameter \a x. If the length
|
||||||
|
/// is zero but the index is non-zero, the result is undefined.
|
||||||
|
/// \returns A 128-bit integer vector containing the original lower 64-bits of
|
||||||
|
/// destination operand \a x with the specified bitfields replaced by the
|
||||||
|
/// lower bits of source operand \a y. The upper 64 bits of the return value
|
||||||
|
/// are undefined.
|
||||||
|
#define _mm_inserti_si64(x, y, len, idx) \
|
||||||
|
((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
|
||||||
|
(__v2di)(__m128i)(y), \
|
||||||
|
(char)(len), (char)(idx)))
|
||||||
|
|
||||||
|
/// Inserts bits of a specified length from the source integer vector
|
||||||
|
/// \a __y into the lower 64 bits of the destination integer vector \a __x
|
||||||
|
/// at the index and of the length specified by \a __y.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __x
|
||||||
|
/// The destination operand where bits will be inserted. The inserted bits
|
||||||
|
/// are defined by the length and by the index of the least significant bit
|
||||||
|
/// specified by operand \a __y.
|
||||||
|
/// \param __y
|
||||||
|
/// The source operand containing the bits to be extracted. The extracted
|
||||||
|
/// bits are the least significant bits of operand \a __y with length
|
||||||
|
/// specified by bits [69:64]. These are inserted into the destination at the
|
||||||
|
/// index specified by bits [77:72]; all other bits are ignored. If bits
|
||||||
|
/// [69:64] are zero, the length is interpreted as 64. If the sum of the
|
||||||
|
/// index and length is greater than 64, the result is undefined. If the
|
||||||
|
/// length and index are both zero, bits [63:0] of parameter \a __y are
|
||||||
|
/// inserted into parameter \a __x. If the length is zero but the index is
|
||||||
|
/// non-zero, the result is undefined.
|
||||||
|
/// \returns A 128-bit integer vector containing the original lower 64-bits of
|
||||||
|
/// destination operand \a __x with the specified bitfields replaced by the
|
||||||
|
/// lower bits of source operand \a __y. The upper 64 bits of the return
|
||||||
|
/// value are undefined.
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
|
_mm_insert_si64(__m128i __x, __m128i __y)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stores a 64-bit double-precision value in a 64-bit memory location.
|
||||||
|
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
|
||||||
|
/// used again soon).
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __p
|
||||||
|
/// The 64-bit memory location used to store the register value.
|
||||||
|
/// \param __a
|
||||||
|
/// The 64-bit double-precision floating-point register value to be stored.
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS
|
||||||
|
_mm_stream_sd(void *__p, __m128d __a)
|
||||||
|
{
|
||||||
|
__builtin_ia32_movntsd((double *)__p, (__v2df)__a);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stores a 32-bit single-precision floating-point value in a 32-bit
|
||||||
|
/// memory location. To minimize caching, the data is flagged as
|
||||||
|
/// non-temporal (unlikely to be used again soon).
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __p
|
||||||
|
/// The 32-bit memory location used to store the register value.
|
||||||
|
/// \param __a
|
||||||
|
/// The 32-bit single-precision floating-point register value to be stored.
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS
|
||||||
|
_mm_stream_ss(void *__p, __m128 __a)
|
||||||
|
{
|
||||||
|
__builtin_ia32_movntss((float *)__p, (__v4sf)__a);
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif /* __AMMINTRIN_H */
|
169
third_party/intel/clang/amxcomplexintrin.h
vendored
Normal file
169
third_party/intel/clang/amxcomplexintrin.h
vendored
Normal file
|
@ -0,0 +1,169 @@
|
||||||
|
/*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===------------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif // __IMMINTRIN_H
|
||||||
|
|
||||||
|
#ifndef __AMX_COMPLEXINTRIN_H
|
||||||
|
#define __AMX_COMPLEXINTRIN_H
|
||||||
|
#ifdef __x86_64__
|
||||||
|
|
||||||
|
#define __DEFAULT_FN_ATTRS_COMPLEX \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __target__("amx-complex")))
|
||||||
|
|
||||||
|
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the imaginary part of the result. For each possible combination
/// of (row of \a a, column of \a b), it performs a set of multiplication
/// and accumulations on all corresponding complex numbers (one from \a a
/// and one from \a b). The imaginary part of the \a a element is multiplied
/// with the real part of the corresponding \a b element, and the real part
/// of the \a a element is multiplied with the imaginary part of the
/// corresponding \a b elements. The two accumulated results are added, and
/// then accumulated into the corresponding row and column of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
/* Arguments are parenthesized in the expansion, matching the _tile_dp*
   macros in amxintrin.h. */
#define _tile_cmmimfp16ps(dst, a, b)                                           \
  __builtin_ia32_tcmmimfp16ps((dst), (a), (b))
|
||||||
|
|
||||||
|
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the real part of the result. For each possible combination
/// of (row of \a a, column of \a b), it performs a set of multiplication
/// and accumulations on all corresponding complex numbers (one from \a a
/// and one from \a b). The real part of the \a a element is multiplied
/// with the real part of the corresponding \a b element, and the negated
/// imaginary part of the \a a element is multiplied with the imaginary
/// part of the corresponding \a b elements. The two accumulated results
/// are added, and then accumulated into the corresponding row and column
/// of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMRLFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
/* Arguments are parenthesized in the expansion, matching the _tile_dp*
   macros in amxintrin.h. */
#define _tile_cmmrlfp16ps(dst, a, b)                                           \
  __builtin_ia32_tcmmrlfp16ps((dst), (a), (b))
|
||||||
|
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
|
||||||
|
_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
|
||||||
|
_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Perform matrix multiplication of two tiles containing complex elements and
|
||||||
|
/// accumulate the results into a packed single precision tile. Each dword
|
||||||
|
/// element in input tiles src0 and src1 is interpreted as a complex number with
|
||||||
|
/// FP16 real part and FP16 imaginary part.
|
||||||
|
/// This function calculates the imaginary part of the result.
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TCMMIMFP16PS </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_COMPLEX
|
||||||
|
static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col,
|
||||||
|
dst->tile, src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Perform matrix multiplication of two tiles containing complex elements and
|
||||||
|
/// accumulate the results into a packed single precision tile. Each dword
|
||||||
|
/// element in input tiles src0 and src1 is interpreted as a complex number with
|
||||||
|
/// FP16 real part and FP16 imaginary part.
|
||||||
|
/// This function calculates the real part of the result.
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TCMMRLFP16PS </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_COMPLEX
|
||||||
|
static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col,
|
||||||
|
dst->tile, src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // __x86_64__
|
||||||
|
#endif // __AMX_COMPLEXINTRIN_H
|
58
third_party/intel/clang/amxfp16intrin.h
vendored
Normal file
58
third_party/intel/clang/amxfp16intrin.h
vendored
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
/*===------------- amxfp16intrin.h - AMX_FP16 intrinsics -*- C++ -*---------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===------------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <amxfp16intrin.h> directly; use <immintrin.h> instead."
|
||||||
|
#endif /* __IMMINTRIN_H */
|
||||||
|
|
||||||
|
#ifndef __AMX_FP16INTRIN_H
|
||||||
|
#define __AMX_FP16INTRIN_H
|
||||||
|
#ifdef __x86_64__
|
||||||
|
|
||||||
|
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
/// and \a b, accumulating the intermediate single-precision (32-bit)
/// floating-point elements with elements in \a dst, and store the 32-bit
/// result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_dpfp16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
///                      FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
///                      FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
/* Arguments are parenthesized in the expansion, matching the _tile_dp*
   macros in amxintrin.h. */
#define _tile_dpfp16ps(dst, a, b)                                              \
  __builtin_ia32_tdpfp16ps((dst), (a), (b))
|
||||||
|
|
||||||
|
#endif /* __x86_64__ */
|
||||||
|
#endif /* __AMX_FP16INTRIN_H */
|
524
third_party/intel/clang/amxintrin.h
vendored
Normal file
524
third_party/intel/clang/amxintrin.h
vendored
Normal file
|
@ -0,0 +1,524 @@
|
||||||
|
/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===------------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif /* __IMMINTRIN_H */
|
||||||
|
|
||||||
|
#ifndef __AMXINTRIN_H
|
||||||
|
#define __AMXINTRIN_H
|
||||||
|
#ifdef __x86_64__
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS_TILE \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
|
||||||
|
#define __DEFAULT_FN_ATTRS_INT8 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
|
||||||
|
#define __DEFAULT_FN_ATTRS_BF16 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
|
||||||
|
#define __DEFAULT_FN_ATTRS_FP16 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
|
||||||
|
|
||||||
|
/// Load tile configuration from a 64-byte memory location specified by
|
||||||
|
/// "mem_addr". The tile configuration includes the tile type palette, the
|
||||||
|
/// number of bytes per row, and the number of rows. If the specified
|
||||||
|
/// palette_id is zero, that signifies the init state for both the tile
|
||||||
|
/// config and the tile data, and the tiles are zeroed. Any invalid
|
||||||
|
/// configurations will result in #GP fault.
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __config
|
||||||
|
/// A pointer to 512-bits configuration
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS_TILE
|
||||||
|
_tile_loadconfig(const void *__config) {
|
||||||
|
__builtin_ia32_tile_loadconfig(__config);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stores the current tile configuration to a 64-byte memory location
|
||||||
|
/// specified by "mem_addr". The tile configuration includes the tile type
|
||||||
|
/// palette, the number of bytes per row, and the number of rows. If tiles
|
||||||
|
/// are not configured, all zeroes will be stored to memory.
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __config
|
||||||
|
/// A pointer to 512-bits configuration
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS_TILE
|
||||||
|
_tile_storeconfig(void *__config) {
|
||||||
|
__builtin_ia32_tile_storeconfig(__config);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Release the tile configuration to return to the init state, which
|
||||||
|
/// releases all storage it currently holds.
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
|
||||||
|
__builtin_ia32_tilerelease();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
#define _tile_loadd(dst, base, stride)                                         \
  __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
                             (__SIZE_TYPE__)(stride))
|
||||||
|
|
||||||
|
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
#define _tile_stream_loadd(dst, base, stride)                                  \
  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
                               (__SIZE_TYPE__)(stride))
|
||||||
|
|
||||||
|
/// Store the tile specified by "dst" to memory specified by "base" address and
/// "stride" using the tile configuration previously configured via
/// "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param dst
///    The tile whose rows are written to memory (despite the name, it is the
///    source of the store). Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be stored in memory.
#define _tile_stored(dst, base, stride)                                        \
  __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
|
||||||
|
|
||||||
|
/// Zeroes every element of the tile named by \a tile.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param tile
///    The destination tile to be zero. Max size is 1024 Bytes.
#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
|
||||||
|
|
||||||
|
/// Byte dot-product with accumulation: each group of 4 adjacent signed 8-bit
/// integers from src0 is multiplied with the corresponding signed 8-bit
/// integers from src1, the 4 intermediate 32-bit products are summed, and
/// the sum is added to the matching 32-bit element of "dst", which receives
/// the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1)                                          \
  __builtin_ia32_tdpbssd((dst), (src0), (src1))
|
||||||
|
|
||||||
|
/// Byte dot-product with accumulation: each group of 4 adjacent signed 8-bit
/// integers from src0 is multiplied with the corresponding unsigned 8-bit
/// integers from src1, the 4 intermediate 32-bit products are summed, and
/// the sum is added to the matching 32-bit element of "dst", which receives
/// the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1)                                          \
  __builtin_ia32_tdpbsud((dst), (src0), (src1))
|
||||||
|
|
||||||
|
/// Byte dot-product with accumulation: each group of 4 adjacent unsigned
/// 8-bit integers from src0 is multiplied with the corresponding signed
/// 8-bit integers from src1, the 4 intermediate 32-bit products are summed,
/// and the sum is added to the matching 32-bit element of "dst", which
/// receives the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1)                                          \
  __builtin_ia32_tdpbusd((dst), (src0), (src1))
|
||||||
|
|
||||||
|
/// Byte dot-product with accumulation: each group of 4 adjacent unsigned
/// 8-bit integers from src0 is multiplied with the corresponding unsigned
/// 8-bit integers from src1, the 4 intermediate 32-bit products are summed,
/// and the sum is added to the matching 32-bit element of "dst", which
/// receives the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1)                                          \
  __builtin_ia32_tdpbuud((dst), (src0), (src1))
|
||||||
|
|
||||||
|
/// BF16 dot-product with accumulation: pairs of BF16 (16-bit) floating-point
/// values from src0 and src1 are multiplied, the intermediate
/// single-precision (32-bit) products are summed with the matching elements
/// of "dst", and the 32-bit results are written back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1)                                        \
  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
|
||||||
|
|
||||||
|
/// AMX tile registers are configurable; the largest is 16 rows x 64 bytes =
/// 1024 bytes. LLVM IR has no 2D type, so a flat vector of the maximum tile
/// size stands in for a 2D tile.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||||
|
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
|
||||||
|
__SIZE_TYPE__ stride) {
|
||||||
|
return __builtin_ia32_tileloadd64_internal(m, n, base,
|
||||||
|
(__SIZE_TYPE__)(stride));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||||
|
_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
|
||||||
|
__SIZE_TYPE__ stride) {
|
||||||
|
return __builtin_ia32_tileloaddt164_internal(m, n, base,
|
||||||
|
(__SIZE_TYPE__)(stride));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||||
|
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||||
|
_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||||
|
_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||||
|
_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS_INT8
|
||||||
|
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
|
||||||
|
__SIZE_TYPE__ stride, _tile1024i tile) {
|
||||||
|
return __builtin_ia32_tilestored64_internal(m, n, base,
|
||||||
|
(__SIZE_TYPE__)(stride), tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
|
||||||
|
_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
|
||||||
|
_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This struct pack the shape and tile data together for user. We suggest
|
||||||
|
/// initializing the struct as early as possible, because compiler depends
|
||||||
|
/// on the shape information to do configure. The constant value is preferred
|
||||||
|
/// for optimization by compiler.
|
||||||
|
typedef struct __tile1024i_str {
|
||||||
|
const unsigned short row;
|
||||||
|
const unsigned short col;
|
||||||
|
_tile1024i tile;
|
||||||
|
} __tile1024i;
|
||||||
|
|
||||||
|
/// Load tile rows from memory specifieid by "base" address and "stride" into
|
||||||
|
/// destination tile "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// A destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param base
|
||||||
|
/// A pointer to base address.
|
||||||
|
/// \param stride
|
||||||
|
/// The stride between the rows' data to be loaded in memory.
|
||||||
|
__DEFAULT_FN_ATTRS_TILE
|
||||||
|
static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
|
||||||
|
__SIZE_TYPE__ stride) {
|
||||||
|
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load tile rows from memory specifieid by "base" address and "stride" into
|
||||||
|
/// destination tile "dst". This intrinsic provides a hint to the implementation
|
||||||
|
/// that the data will likely not be reused in the near future and the data
|
||||||
|
/// caching can be optimized accordingly.
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// A destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param base
|
||||||
|
/// A pointer to base address.
|
||||||
|
/// \param stride
|
||||||
|
/// The stride between the rows' data to be loaded in memory.
|
||||||
|
__DEFAULT_FN_ATTRS_TILE
|
||||||
|
static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
|
||||||
|
__SIZE_TYPE__ stride) {
|
||||||
|
dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||||
|
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
|
||||||
|
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
|
||||||
|
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
|
||||||
|
/// and store the 32-bit result back to tile "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_INT8
|
||||||
|
static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||||
|
src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||||
|
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
|
||||||
|
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
|
||||||
|
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
|
||||||
|
/// in "dst", and store the 32-bit result back to tile "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_INT8
|
||||||
|
static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||||
|
src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||||
|
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
|
||||||
|
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
|
||||||
|
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
|
||||||
|
/// and store the 32-bit result back to tile "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_INT8
|
||||||
|
static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||||
|
src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||||
|
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
|
||||||
|
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
|
||||||
|
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
|
||||||
|
/// "dst", and store the 32-bit result back to tile "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_INT8
|
||||||
|
static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||||
|
src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Store the tile specified by "src" to memory specifieid by "base" address and
|
||||||
|
/// "stride".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param base
|
||||||
|
/// A pointer to base address.
|
||||||
|
/// \param stride
|
||||||
|
/// The stride between the rows' data to be stored in memory.
|
||||||
|
__DEFAULT_FN_ATTRS_TILE
|
||||||
|
static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
|
||||||
|
__tile1024i src) {
|
||||||
|
_tile_stored_internal(src.row, src.col, base, stride, src.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Zero the tile specified by "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile to be zero. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_TILE
|
||||||
|
static __inline__ void __tile_zero(__tile1024i *dst) {
|
||||||
|
dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
|
||||||
|
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
|
||||||
|
/// elements with elements in "dst", and store the 32-bit result back to tile
|
||||||
|
/// "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_BF16
|
||||||
|
static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||||
|
src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
|
||||||
|
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
|
||||||
|
/// elements with elements in "dst", and store the 32-bit result back to tile
|
||||||
|
/// "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_FP16
|
||||||
|
static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||||
|
src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS_TILE
|
||||||
|
#undef __DEFAULT_FN_ATTRS_INT8
|
||||||
|
#undef __DEFAULT_FN_ATTRS_BF16
|
||||||
|
#undef __DEFAULT_FN_ATTRS_FP16
|
||||||
|
|
||||||
|
#endif /* __x86_64__ */
|
||||||
|
#endif /* __AMXINTRIN_H */
|
5284
third_party/intel/clang/avx2intrin.h
vendored
Normal file
5284
third_party/intel/clang/avx2intrin.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
283
third_party/intel/clang/avx512bf16intrin.h
vendored
Normal file
283
third_party/intel/clang/avx512bf16intrin.h
vendored
Normal file
|
@ -0,0 +1,283 @@
|
||||||
|
/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __SSE2__
|
||||||
|
|
||||||
|
#ifndef __AVX512BF16INTRIN_H
|
||||||
|
#define __AVX512BF16INTRIN_H
|
||||||
|
|
||||||
|
typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
|
||||||
|
typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
|
||||||
|
typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
|
||||||
|
|
||||||
|
#define __DEFAULT_FN_ATTRS512 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16,evex512"), \
|
||||||
|
__min_vector_width__(512)))
|
||||||
|
#define __DEFAULT_FN_ATTRS \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512bf16,no-evex512")))
|
||||||
|
|
||||||
|
/// Convert One BF16 Data to One Single Float Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic does not correspond to a specific instruction.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A bfloat data.
|
||||||
|
/// \returns A float data whose sign field and exponent field keep unchanged,
|
||||||
|
/// and fraction field is extended to 23 bits.
|
||||||
|
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
|
||||||
|
return __builtin_ia32_cvtsbf162ss_32(__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
|
||||||
|
/// conversion of __B, and higher 256 bits come from conversion of __A.
|
||||||
|
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
|
||||||
|
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
|
||||||
|
(__v16sf) __B);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __W
|
||||||
|
/// A 512-bit vector of [32 x bfloat].
|
||||||
|
/// \param __U
|
||||||
|
/// A 32-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A or __B. A 0 means element from __W.
|
||||||
|
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
|
||||||
|
/// conversion of __B, and higher 256 bits come from conversion of __A.
|
||||||
|
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
|
||||||
|
return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
|
||||||
|
(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
|
||||||
|
(__v32bf)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 32-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A or __B. A 0 means element is zero.
|
||||||
|
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
|
||||||
|
/// conversion of __B, and higher 256 bits come from conversion of __A.
|
||||||
|
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
|
||||||
|
return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
|
||||||
|
(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
|
||||||
|
(__v32bf)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
|
||||||
|
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_cvtneps_pbh(__m512 __A) {
|
||||||
|
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
|
||||||
|
(__v16bf)_mm256_undefined_si256(),
|
||||||
|
(__mmask16)-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __W
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A. A 0 means element from __W.
|
||||||
|
/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
|
||||||
|
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
|
||||||
|
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
|
||||||
|
(__v16bf)__W,
|
||||||
|
(__mmask16)__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A. A 0 means element is zero.
|
||||||
|
/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
|
||||||
|
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
|
||||||
|
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
|
||||||
|
(__v16bf)_mm256_setzero_si256(),
|
||||||
|
(__mmask16)__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [32 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 512-bit vector of [32 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m512 __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
|
||||||
|
return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
|
||||||
|
(__v32bf) __A,
|
||||||
|
(__v32bf) __B);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [32 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 512-bit vector of [32 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
|
||||||
|
/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m512 __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
|
||||||
|
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
|
||||||
|
(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
|
||||||
|
(__v16sf)__D);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [32 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 512-bit vector of [32 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
|
||||||
|
/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m512 __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
|
||||||
|
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
|
||||||
|
(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
|
||||||
|
(__v16sf)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
|
||||||
|
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
|
||||||
|
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
|
||||||
|
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask. Elements are zeroed out when the corresponding mask
|
||||||
|
/// bit is not set.
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
|
||||||
|
static __inline__ __m512 __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
|
||||||
|
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
|
||||||
|
(__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data using merging mask.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __S
|
||||||
|
/// A 512-bit vector of [16 x float]. Elements are copied from __S when
|
||||||
|
/// the corresponding mask bit is not set.
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask.
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
|
||||||
|
static __inline__ __m512 __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
|
||||||
|
return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
|
||||||
|
(__m512i)__S, (__mmask16)__U,
|
||||||
|
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
#undef __DEFAULT_FN_ATTRS512
|
||||||
|
|
||||||
|
#endif
|
||||||
|
#endif
|
86
third_party/intel/clang/avx512bitalgintrin.h
vendored
Normal file
86
third_party/intel/clang/avx512bitalgintrin.h
vendored
Normal file
|
@ -0,0 +1,86 @@
|
||||||
|
/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __AVX512BITALGINTRIN_H
|
||||||
|
#define __AVX512BITALGINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512bitalg,evex512"), \
|
||||||
|
__min_vector_width__(512)))
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_popcnt_epi16(__m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
|
||||||
|
(__v32hi) _mm512_popcnt_epi16(__B),
|
||||||
|
(__v32hi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
|
||||||
|
{
|
||||||
|
return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(),
|
||||||
|
__U,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_popcnt_epi8(__m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
|
||||||
|
(__v64qi) _mm512_popcnt_epi8(__B),
|
||||||
|
(__v64qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
|
||||||
|
{
|
||||||
|
return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(),
|
||||||
|
__U,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
|
||||||
|
{
|
||||||
|
return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
|
||||||
|
(__v64qi) __B,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
|
||||||
|
{
|
||||||
|
return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
|
||||||
|
__A,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif
|
2014
third_party/intel/clang/avx512bwintrin.h
vendored
Normal file
2014
third_party/intel/clang/avx512bwintrin.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
125
third_party/intel/clang/avx512cdintrin.h
vendored
Normal file
125
third_party/intel/clang/avx512cdintrin.h
vendored
Normal file
|
@ -0,0 +1,125 @@
|
||||||
|
/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __AVX512CDINTRIN_H
|
||||||
|
#define __AVX512CDINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512cd,evex512"), __min_vector_width__(512)))
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_conflict_epi64 (__m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
|
||||||
|
(__v8di)_mm512_conflict_epi64(__A),
|
||||||
|
(__v8di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
|
||||||
|
(__v8di)_mm512_conflict_epi64(__A),
|
||||||
|
(__v8di)_mm512_setzero_si512 ());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_conflict_epi32 (__m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||||
|
(__v16si)_mm512_conflict_epi32(__A),
|
||||||
|
(__v16si)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||||
|
(__v16si)_mm512_conflict_epi32(__A),
|
||||||
|
(__v16si)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_lzcnt_epi32 (__m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||||
|
(__v16si)_mm512_lzcnt_epi32(__A),
|
||||||
|
(__v16si)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||||
|
(__v16si)_mm512_lzcnt_epi32(__A),
|
||||||
|
(__v16si)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_lzcnt_epi64 (__m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
|
||||||
|
(__v8di)_mm512_lzcnt_epi64(__A),
|
||||||
|
(__v8di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
|
||||||
|
(__v8di)_mm512_lzcnt_epi64(__A),
|
||||||
|
(__v8di)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_broadcastmb_epi64 (__mmask8 __A)
|
||||||
|
{
|
||||||
|
return (__m512i) _mm512_set1_epi64((long long) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_broadcastmw_epi32 (__mmask16 __A)
|
||||||
|
{
|
||||||
|
return (__m512i) _mm512_set1_epi32((int) __A);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif
|
1379
third_party/intel/clang/avx512dqintrin.h
vendored
Normal file
1379
third_party/intel/clang/avx512dqintrin.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
271
third_party/intel/clang/avx512erintrin.h
vendored
Normal file
271
third_party/intel/clang/avx512erintrin.h
vendored
Normal file
|
@ -0,0 +1,271 @@
|
||||||
|
/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __AVX512ERINTRIN_H
|
||||||
|
#define __AVX512ERINTRIN_H
|
||||||
|
|
||||||
|
/* exp2a23 */
|
||||||
|
#define _mm512_exp2a23_round_pd(A, R) \
|
||||||
|
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)_mm512_setzero_pd(), \
|
||||||
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
|
||||||
|
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)(__m512d)(S), (__mmask8)(M), \
|
||||||
|
(int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
|
||||||
|
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)_mm512_setzero_pd(), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_exp2a23_pd(A) \
|
||||||
|
_mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_mask_exp2a23_pd(S, M, A) \
|
||||||
|
_mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_maskz_exp2a23_pd(M, A) \
|
||||||
|
_mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_exp2a23_round_ps(A, R) \
|
||||||
|
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)_mm512_setzero_ps(), \
|
||||||
|
(__mmask16)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
|
||||||
|
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)(__m512)(S), (__mmask16)(M), \
|
||||||
|
(int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
|
||||||
|
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)_mm512_setzero_ps(), \
|
||||||
|
(__mmask16)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_exp2a23_ps(A) \
|
||||||
|
_mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_mask_exp2a23_ps(S, M, A) \
|
||||||
|
_mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_maskz_exp2a23_ps(M, A) \
|
||||||
|
_mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
/* rsqrt28 */
|
||||||
|
#define _mm512_rsqrt28_round_pd(A, R) \
|
||||||
|
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)_mm512_setzero_pd(), \
|
||||||
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
|
||||||
|
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)(__m512d)(S), (__mmask8)(M), \
|
||||||
|
(int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
|
||||||
|
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)_mm512_setzero_pd(), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_rsqrt28_pd(A) \
|
||||||
|
_mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_mask_rsqrt28_pd(S, M, A) \
|
||||||
|
_mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_maskz_rsqrt28_pd(M, A) \
|
||||||
|
_mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_rsqrt28_round_ps(A, R) \
|
||||||
|
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)_mm512_setzero_ps(), \
|
||||||
|
(__mmask16)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
|
||||||
|
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)(__m512)(S), (__mmask16)(M), \
|
||||||
|
(int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
|
||||||
|
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)_mm512_setzero_ps(), \
|
||||||
|
(__mmask16)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_rsqrt28_ps(A) \
|
||||||
|
_mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_mask_rsqrt28_ps(S, M, A) \
|
||||||
|
_mm512_mask_rsqrt28_round_ps((S), (M), A, _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_maskz_rsqrt28_ps(M, A) \
|
||||||
|
_mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_rsqrt28_round_ss(A, B, R) \
|
||||||
|
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
|
||||||
|
(__v4sf)(__m128)(B), \
|
||||||
|
(__v4sf)_mm_setzero_ps(), \
|
||||||
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
|
||||||
|
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
|
||||||
|
(__v4sf)(__m128)(B), \
|
||||||
|
(__v4sf)(__m128)(S), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
|
||||||
|
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
|
||||||
|
(__v4sf)(__m128)(B), \
|
||||||
|
(__v4sf)_mm_setzero_ps(), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_rsqrt28_ss(A, B) \
|
||||||
|
_mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_mask_rsqrt28_ss(S, M, A, B) \
|
||||||
|
_mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_maskz_rsqrt28_ss(M, A, B) \
|
||||||
|
_mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_rsqrt28_round_sd(A, B, R) \
|
||||||
|
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
|
||||||
|
(__v2df)(__m128d)(B), \
|
||||||
|
(__v2df)_mm_setzero_pd(), \
|
||||||
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
|
||||||
|
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
|
||||||
|
(__v2df)(__m128d)(B), \
|
||||||
|
(__v2df)(__m128d)(S), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
|
||||||
|
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
|
||||||
|
(__v2df)(__m128d)(B), \
|
||||||
|
(__v2df)_mm_setzero_pd(), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_rsqrt28_sd(A, B) \
|
||||||
|
_mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_mask_rsqrt28_sd(S, M, A, B) \
|
||||||
|
_mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_maskz_rsqrt28_sd(M, A, B) \
|
||||||
|
_mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
/* rcp28 */
|
||||||
|
#define _mm512_rcp28_round_pd(A, R) \
|
||||||
|
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)_mm512_setzero_pd(), \
|
||||||
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
|
||||||
|
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)(__m512d)(S), (__mmask8)(M), \
|
||||||
|
(int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_rcp28_round_pd(M, A, R) \
|
||||||
|
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)_mm512_setzero_pd(), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_rcp28_pd(A) \
|
||||||
|
_mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_mask_rcp28_pd(S, M, A) \
|
||||||
|
_mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_maskz_rcp28_pd(M, A) \
|
||||||
|
_mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_rcp28_round_ps(A, R) \
|
||||||
|
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)_mm512_setzero_ps(), \
|
||||||
|
(__mmask16)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
|
||||||
|
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)(__m512)(S), (__mmask16)(M), \
|
||||||
|
(int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_rcp28_round_ps(M, A, R) \
|
||||||
|
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)_mm512_setzero_ps(), \
|
||||||
|
(__mmask16)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_rcp28_ps(A) \
|
||||||
|
_mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_mask_rcp28_ps(S, M, A) \
|
||||||
|
_mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_maskz_rcp28_ps(M, A) \
|
||||||
|
_mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_rcp28_round_ss(A, B, R) \
|
||||||
|
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
|
||||||
|
(__v4sf)(__m128)(B), \
|
||||||
|
(__v4sf)_mm_setzero_ps(), \
|
||||||
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
|
||||||
|
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
|
||||||
|
(__v4sf)(__m128)(B), \
|
||||||
|
(__v4sf)(__m128)(S), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
|
||||||
|
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
|
||||||
|
(__v4sf)(__m128)(B), \
|
||||||
|
(__v4sf)_mm_setzero_ps(), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_rcp28_ss(A, B) \
|
||||||
|
_mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_mask_rcp28_ss(S, M, A, B) \
|
||||||
|
_mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_maskz_rcp28_ss(M, A, B) \
|
||||||
|
_mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_rcp28_round_sd(A, B, R) \
|
||||||
|
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
|
||||||
|
(__v2df)(__m128d)(B), \
|
||||||
|
(__v2df)_mm_setzero_pd(), \
|
||||||
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
|
||||||
|
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
|
||||||
|
(__v2df)(__m128d)(B), \
|
||||||
|
(__v2df)(__m128d)(S), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
|
||||||
|
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
|
||||||
|
(__v2df)(__m128d)(B), \
|
||||||
|
(__v2df)_mm_setzero_pd(), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_rcp28_sd(A, B) \
|
||||||
|
_mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_mask_rcp28_sd(S, M, A, B) \
|
||||||
|
_mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_maskz_rcp28_sd(M, A, B) \
|
||||||
|
_mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#endif /* __AVX512ERINTRIN_H */
|
9779
third_party/intel/clang/avx512fintrin.h
vendored
Normal file
9779
third_party/intel/clang/avx512fintrin.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
3352
third_party/intel/clang/avx512fp16intrin.h
vendored
Normal file
3352
third_party/intel/clang/avx512fp16intrin.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
70
third_party/intel/clang/avx512ifmaintrin.h
vendored
Normal file
70
third_party/intel/clang/avx512ifmaintrin.h
vendored
Normal file
|
@ -0,0 +1,70 @@
|
||||||
|
/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __IFMAINTRIN_H
|
||||||
|
#define __IFMAINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512ifma,evex512"), __min_vector_width__(512)))
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y,
|
||||||
|
(__v8di) __Z);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__M,
|
||||||
|
(__v8di)_mm512_madd52hi_epu64(__W, __X, __Y),
|
||||||
|
(__v8di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__M,
|
||||||
|
(__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
|
||||||
|
(__v8di)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y,
|
||||||
|
(__v8di) __Z);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__M,
|
||||||
|
(__v8di)_mm512_madd52lo_epu64(__W, __X, __Y),
|
||||||
|
(__v8di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__M,
|
||||||
|
(__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
|
||||||
|
(__v8di)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif
|
111
third_party/intel/clang/avx512ifmavlintrin.h
vendored
Normal file
111
third_party/intel/clang/avx512ifmavlintrin.h
vendored
Normal file
|
@ -0,0 +1,111 @@
|
||||||
|
/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __IFMAVLINTRIN_H
|
||||||
|
#define __IFMAVLINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS128 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512ifma,avx512vl,no-evex512"), \
|
||||||
|
__min_vector_width__(128)))
|
||||||
|
#define __DEFAULT_FN_ATTRS256 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512ifma,avx512vl,no-evex512"), \
|
||||||
|
__min_vector_width__(256)))
|
||||||
|
|
||||||
|
#define _mm_madd52hi_epu64(X, Y, Z) \
|
||||||
|
((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y), \
|
||||||
|
(__v2di)(Z)))
|
||||||
|
|
||||||
|
#define _mm256_madd52hi_epu64(X, Y, Z) \
|
||||||
|
((__m256i)__builtin_ia32_vpmadd52huq256((__v4di)(X), (__v4di)(Y), \
|
||||||
|
(__v4di)(Z)))
|
||||||
|
|
||||||
|
#define _mm_madd52lo_epu64(X, Y, Z) \
|
||||||
|
((__m128i)__builtin_ia32_vpmadd52luq128((__v2di)(X), (__v2di)(Y), \
|
||||||
|
(__v2di)(Z)))
|
||||||
|
|
||||||
|
#define _mm256_madd52lo_epu64(X, Y, Z) \
|
||||||
|
((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y), \
|
||||||
|
(__v4di)(Z)))
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectq_128(__M,
|
||||||
|
(__v2di)_mm_madd52hi_epu64(__W, __X, __Y),
|
||||||
|
(__v2di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectq_128(__M,
|
||||||
|
(__v2di)_mm_madd52hi_epu64(__X, __Y, __Z),
|
||||||
|
(__v2di)_mm_setzero_si128());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectq_256(__M,
|
||||||
|
(__v4di)_mm256_madd52hi_epu64(__W, __X, __Y),
|
||||||
|
(__v4di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectq_256(__M,
|
||||||
|
(__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z),
|
||||||
|
(__v4di)_mm256_setzero_si256());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectq_128(__M,
|
||||||
|
(__v2di)_mm_madd52lo_epu64(__W, __X, __Y),
|
||||||
|
(__v2di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectq_128(__M,
|
||||||
|
(__v2di)_mm_madd52lo_epu64(__X, __Y, __Z),
|
||||||
|
(__v2di)_mm_setzero_si128());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectq_256(__M,
|
||||||
|
(__v4di)_mm256_madd52lo_epu64(__W, __X, __Y),
|
||||||
|
(__v4di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectq_256(__M,
|
||||||
|
(__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z),
|
||||||
|
(__v4di)_mm256_setzero_si256());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS128
|
||||||
|
#undef __DEFAULT_FN_ATTRS256
|
||||||
|
|
||||||
|
#endif
|
92
third_party/intel/clang/avx512pfintrin.h
vendored
Normal file
92
third_party/intel/clang/avx512pfintrin.h
vendored
Normal file
|
@ -0,0 +1,92 @@
|
||||||
|
/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __AVX512PFINTRIN_H
|
||||||
|
#define __AVX512PFINTRIN_H
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
|
||||||
|
(void const *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
|
||||||
|
(void const *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfdps((__mmask16)(mask), \
|
||||||
|
(__v16si)(__m512i)(index), (void const *)(addr), \
|
||||||
|
(int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfdps((__mmask16) -1, \
|
||||||
|
(__v16si)(__m512i)(index), (void const *)(addr), \
|
||||||
|
(int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
|
||||||
|
(void const *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
|
||||||
|
(void const *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
|
||||||
|
(void const *)(addr), (int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
|
||||||
|
(void const *)(addr), (int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
|
||||||
|
(void *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
|
||||||
|
(void *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
|
||||||
|
(void *)(addr), (int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfdps((__mmask16)(mask), \
|
||||||
|
(__v16si)(__m512i)(index), (void *)(addr), \
|
||||||
|
(int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
|
||||||
|
(void *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
|
||||||
|
(void *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
|
||||||
|
(void *)(addr), (int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
|
||||||
|
(void *)(addr), (int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#endif
|
357
third_party/intel/clang/avx512vbmi2intrin.h
vendored
Normal file
357
third_party/intel/clang/avx512vbmi2intrin.h
vendored
Normal file
|
@ -0,0 +1,357 @@
|
||||||
|
/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __AVX512VBMI2INTRIN_H
|
||||||
|
#define __AVX512VBMI2INTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2,evex512"), __min_vector_width__(512)))
|
||||||
|
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
|
||||||
|
(__v32hi) __S,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
|
||||||
|
(__v32hi) _mm512_setzero_si512(),
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
|
||||||
|
(__v64qi) __S,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
|
||||||
|
(__v64qi) _mm512_setzero_si512(),
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
__builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
__builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
|
||||||
|
(__v32hi) __S,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
|
||||||
|
(__v32hi) _mm512_setzero_si512(),
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
|
||||||
|
(__v64qi) __S,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
|
||||||
|
(__v64qi) _mm512_setzero_si512(),
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
|
||||||
|
(__v32hi) __S,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
|
||||||
|
(__v32hi) _mm512_setzero_si512(),
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
|
||||||
|
(__v64qi) __S,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
|
||||||
|
(__v64qi) _mm512_setzero_si512(),
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define _mm512_shldi_epi64(A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
|
||||||
|
(__v8di)(__m512i)(B), (int)(I)))
|
||||||
|
|
||||||
|
#define _mm512_mask_shldi_epi64(S, U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
|
||||||
|
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
|
||||||
|
(__v8di)(__m512i)(S)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_shldi_epi64(U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
|
||||||
|
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
|
||||||
|
(__v8di)_mm512_setzero_si512()))
|
||||||
|
|
||||||
|
#define _mm512_shldi_epi32(A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
|
||||||
|
(__v16si)(__m512i)(B), (int)(I)))
|
||||||
|
|
||||||
|
#define _mm512_mask_shldi_epi32(S, U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
|
||||||
|
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
|
||||||
|
(__v16si)(__m512i)(S)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_shldi_epi32(U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
|
||||||
|
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
|
||||||
|
(__v16si)_mm512_setzero_si512()))
|
||||||
|
|
||||||
|
#define _mm512_shldi_epi16(A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
|
||||||
|
(__v32hi)(__m512i)(B), (int)(I)))
|
||||||
|
|
||||||
|
#define _mm512_mask_shldi_epi16(S, U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
|
||||||
|
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
|
||||||
|
(__v32hi)(__m512i)(S)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_shldi_epi16(U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
|
||||||
|
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
|
||||||
|
(__v32hi)_mm512_setzero_si512()))
|
||||||
|
|
||||||
|
#define _mm512_shrdi_epi64(A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
|
||||||
|
(__v8di)(__m512i)(B), (int)(I)))
|
||||||
|
|
||||||
|
#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
|
||||||
|
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
|
||||||
|
(__v8di)(__m512i)(S)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
|
||||||
|
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
|
||||||
|
(__v8di)_mm512_setzero_si512()))
|
||||||
|
|
||||||
|
#define _mm512_shrdi_epi32(A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
|
||||||
|
(__v16si)(__m512i)(B), (int)(I)))
|
||||||
|
|
||||||
|
#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
|
||||||
|
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
|
||||||
|
(__v16si)(__m512i)(S)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
|
||||||
|
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
|
||||||
|
(__v16si)_mm512_setzero_si512()))
|
||||||
|
|
||||||
|
#define _mm512_shrdi_epi16(A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
|
||||||
|
(__v32hi)(__m512i)(B), (int)(I)))
|
||||||
|
|
||||||
|
#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
|
||||||
|
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
|
||||||
|
(__v32hi)(__m512i)(S)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
|
||||||
|
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
|
||||||
|
(__v32hi)_mm512_setzero_si512()))
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpshldvq512((__v8di)__A, (__v8di)__B,
|
||||||
|
(__v8di)__C);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_shldv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__U,
|
||||||
|
(__v8di)_mm512_shldv_epi64(__A, __B, __C),
|
||||||
|
(__v8di)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__U,
|
||||||
|
(__v8di)_mm512_shldv_epi64(__A, __B, __C),
|
||||||
|
(__v8di)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpshldvd512((__v16si)__A, (__v16si)__B,
|
||||||
|
(__v16si)__C);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_shldv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectd_512(__U,
|
||||||
|
(__v16si)_mm512_shldv_epi32(__A, __B, __C),
|
||||||
|
(__v16si)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectd_512(__U,
|
||||||
|
(__v16si)_mm512_shldv_epi32(__A, __B, __C),
|
||||||
|
(__v16si)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpshldvw512((__v32hi)__A, (__v32hi)__B,
|
||||||
|
(__v32hi)__C);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_shldv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectw_512(__U,
|
||||||
|
(__v32hi)_mm512_shldv_epi16(__A, __B, __C),
|
||||||
|
(__v32hi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectw_512(__U,
|
||||||
|
(__v32hi)_mm512_shldv_epi16(__A, __B, __C),
|
||||||
|
(__v32hi)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpshrdvq512((__v8di)__A, (__v8di)__B,
|
||||||
|
(__v8di)__C);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__U,
|
||||||
|
(__v8di)_mm512_shrdv_epi64(__A, __B, __C),
|
||||||
|
(__v8di)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__U,
|
||||||
|
(__v8di)_mm512_shrdv_epi64(__A, __B, __C),
|
||||||
|
(__v8di)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpshrdvd512((__v16si)__A, (__v16si)__B,
|
||||||
|
(__v16si)__C);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_selectd_512(__U,
|
||||||
|
(__v16si)_mm512_shrdv_epi32(__A, __B, __C),
|
||||||
|
(__v16si)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_selectd_512(__U,
|
||||||
|
(__v16si)_mm512_shrdv_epi32(__A, __B, __C),
|
||||||
|
(__v16si)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpshrdvw512((__v32hi)__A, (__v32hi)__B,
|
||||||
|
(__v32hi)__C);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectw_512(__U,
|
||||||
|
(__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
|
||||||
|
(__v32hi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectw_512(__U,
|
||||||
|
(__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
|
||||||
|
(__v32hi)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
106
third_party/intel/clang/avx512vbmiintrin.h
vendored
Normal file
106
third_party/intel/clang/avx512vbmiintrin.h
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __VBMIINTRIN_H
|
||||||
|
#define __VBMIINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512vbmi,evex512"), __min_vector_width__(512)))
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
|
||||||
|
(__v64qi) __B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
|
||||||
|
__m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectb_512(__U,
|
||||||
|
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v64qi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
|
||||||
|
__m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectb_512(__U,
|
||||||
|
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v64qi)__I);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
|
||||||
|
__m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectb_512(__U,
|
||||||
|
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v64qi)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
|
||||||
|
__m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
|
||||||
|
(__v64qi)_mm512_permutexvar_epi8(__A, __B),
|
||||||
|
(__v64qi)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
|
||||||
|
__m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
|
||||||
|
(__v64qi)_mm512_permutexvar_epi8(__A, __B),
|
||||||
|
(__v64qi)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpmultishiftqb512((__v64qi)__X, (__v64qi) __Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X,
|
||||||
|
__m512i __Y)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
|
||||||
|
(__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
|
||||||
|
(__v64qi)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
|
||||||
|
(__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
|
||||||
|
(__v64qi)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif
|
193
third_party/intel/clang/avx512vbmivlintrin.h
vendored
Normal file
193
third_party/intel/clang/avx512vbmivlintrin.h
vendored
Normal file
|
@ -0,0 +1,193 @@
|
||||||
|
/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __VBMIVLINTRIN_H
|
||||||
|
#define __VBMIVLINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS128 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512vbmi,avx512vl,no-evex512"), \
|
||||||
|
__min_vector_width__(128)))
|
||||||
|
#define __DEFAULT_FN_ATTRS256 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512vbmi,avx512vl,no-evex512"), \
|
||||||
|
__min_vector_width__(256)))
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A,
|
||||||
|
(__v16qi)__I,
|
||||||
|
(__v16qi)__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
|
||||||
|
__m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectb_128(__U,
|
||||||
|
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v16qi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
|
||||||
|
__m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectb_128(__U,
|
||||||
|
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v16qi)__I);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
|
||||||
|
__m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectb_128(__U,
|
||||||
|
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v16qi)_mm_setzero_si128());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
|
||||||
|
(__v32qi)__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
|
||||||
|
__m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectb_256(__U,
|
||||||
|
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v32qi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
|
||||||
|
__m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectb_256(__U,
|
||||||
|
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v32qi)__I);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
|
||||||
|
__m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectb_256(__U,
|
||||||
|
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v32qi)_mm256_setzero_si256());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
|
||||||
|
(__v16qi)_mm_permutexvar_epi8(__A, __B),
|
||||||
|
(__v16qi)_mm_setzero_si128());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
|
||||||
|
__m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
|
||||||
|
(__v16qi)_mm_permutexvar_epi8(__A, __B),
|
||||||
|
(__v16qi)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
|
||||||
|
__m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
|
||||||
|
(__v32qi)_mm256_permutexvar_epi8(__A, __B),
|
||||||
|
(__v32qi)_mm256_setzero_si256());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
|
||||||
|
__m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
|
||||||
|
(__v32qi)_mm256_permutexvar_epi8(__A, __B),
|
||||||
|
(__v32qi)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_vpmultishiftqb128((__v16qi)__X, (__v16qi)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X,
|
||||||
|
__m128i __Y)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
|
||||||
|
(__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
|
||||||
|
(__v16qi)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
|
||||||
|
(__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
|
||||||
|
(__v16qi)_mm_setzero_si128());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_vpmultishiftqb256((__v32qi)__X, (__v32qi)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X,
|
||||||
|
__m256i __Y)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
|
||||||
|
(__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
|
||||||
|
(__v32qi)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
|
||||||
|
(__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
|
||||||
|
(__v32qi)_mm256_setzero_si256());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS128
|
||||||
|
#undef __DEFAULT_FN_ATTRS256
|
||||||
|
|
||||||
|
#endif
|
517
third_party/intel/clang/avx512vlbf16intrin.h
vendored
Normal file
517
third_party/intel/clang/avx512vlbf16intrin.h
vendored
Normal file
|
@ -0,0 +1,517 @@
|
||||||
|
/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __SSE2__
|
||||||
|
|
||||||
|
#ifndef __AVX512VLBF16INTRIN_H
|
||||||
|
#define __AVX512VLBF16INTRIN_H
|
||||||
|
|
||||||
|
#define __DEFAULT_FN_ATTRS128 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512vl,avx512bf16,no-evex512"), \
|
||||||
|
__min_vector_width__(128)))
|
||||||
|
#define __DEFAULT_FN_ATTRS256 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512vl,avx512bf16,no-evex512"), \
|
||||||
|
__min_vector_width__(256)))
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
|
||||||
|
/// conversion of __B, and higher 64 bits come from conversion of __A.
|
||||||
|
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
|
||||||
|
return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
|
||||||
|
(__v4sf) __B);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __W
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A or __B. A 0 means element from __W.
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
|
||||||
|
/// conversion of __B, and higher 64 bits come from conversion of __A.
|
||||||
|
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
|
||||||
|
return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
|
||||||
|
(__v8bf)_mm_cvtne2ps_pbh(__A, __B),
|
||||||
|
(__v8bf)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A or __B. A 0 means element is zero.
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
|
||||||
|
/// conversion of __B, and higher 64 bits come from conversion of __A.
|
||||||
|
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
|
||||||
|
return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
|
||||||
|
(__v8bf)_mm_cvtne2ps_pbh(__A, __B),
|
||||||
|
(__v8bf)_mm_setzero_si128());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
|
||||||
|
/// conversion of __B, and higher 128 bits come from conversion of __A.
|
||||||
|
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
|
||||||
|
return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
|
||||||
|
(__v8sf) __B);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __W
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A or __B. A 0 means element from __W.
|
||||||
|
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
|
||||||
|
/// conversion of __B, and higher 128 bits come from conversion of __A.
|
||||||
|
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
|
||||||
|
return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
|
||||||
|
(__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
|
||||||
|
(__v16bf)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A or __B. A 0 means element is zero.
|
||||||
|
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
|
||||||
|
/// conversion of __B, and higher 128 bits come from conversion of __A.
|
||||||
|
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
|
||||||
|
return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
|
||||||
|
(__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
|
||||||
|
(__v16bf)_mm256_setzero_si256());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
|
||||||
|
/// conversion of __A, and higher 64 bits are 0.
|
||||||
|
#define _mm_cvtneps_pbh(A) \
|
||||||
|
((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __W
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __U
|
||||||
|
/// A 4-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A. A 0 means element from __W.
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
|
||||||
|
/// conversion of __A, and higher 64 bits are 0.
|
||||||
|
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
|
||||||
|
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
|
||||||
|
(__v8bf)__W,
|
||||||
|
(__mmask8)__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 4-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A. A 0 means element is zero.
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
|
||||||
|
/// conversion of __A, and higher 64 bits are 0.
|
||||||
|
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
|
||||||
|
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
|
||||||
|
(__v8bf)_mm_setzero_si128(),
|
||||||
|
(__mmask8)__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
|
||||||
|
#define _mm256_cvtneps_pbh(A) \
|
||||||
|
((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __W
|
||||||
|
/// A 256-bit vector of [8 x bfloat].
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A. A 0 means element from __W.
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
|
||||||
|
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
|
||||||
|
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
|
||||||
|
(__v8bf)__W,
|
||||||
|
(__mmask8)__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A. A 0 means element is zero.
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
|
||||||
|
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
|
||||||
|
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
|
||||||
|
(__v8bf)_mm_setzero_si128(),
|
||||||
|
(__mmask8)__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \returns A 128-bit vector of [4 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
|
||||||
|
return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
|
||||||
|
(__v8bf)__A,
|
||||||
|
(__v8bf)__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
|
||||||
|
/// \returns A 128-bit vector of [4 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
|
||||||
|
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
|
||||||
|
(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
|
||||||
|
(__v4sf)__D);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
|
||||||
|
/// \returns A 128-bit vector of [4 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
|
||||||
|
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
|
||||||
|
(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
|
||||||
|
(__v4sf)_mm_setzero_si128());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \returns A 256-bit vector of [8 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m256 __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
|
||||||
|
return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
|
||||||
|
(__v16bf)__A,
|
||||||
|
(__v16bf)__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
|
||||||
|
/// \returns A 256-bit vector of [8 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m256 __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
|
||||||
|
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
|
||||||
|
(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
|
||||||
|
(__v8sf)__D);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
|
||||||
|
/// \returns A 256-bit vector of [8 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m256 __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
|
||||||
|
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
|
||||||
|
(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
|
||||||
|
(__v8sf)_mm256_setzero_si256());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert One Single float Data to One BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A float data.
|
||||||
|
/// \returns A bf16 data whose sign field and exponent field keep unchanged,
|
||||||
|
/// and fraction field is truncated to 7 bits.
|
||||||
|
static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
|
||||||
|
__v4sf __V = {__A, 0, 0, 0};
|
||||||
|
__v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
|
||||||
|
(__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
|
||||||
|
return (__bf16)__R[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x bfloat].
|
||||||
|
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
|
||||||
|
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
|
||||||
|
return _mm_castsi128_ps(
|
||||||
|
(__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
|
||||||
|
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
|
||||||
|
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
|
||||||
|
(__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __U
|
||||||
|
/// A 4-bit mask. Elements are zeroed out when the corresponding mask
|
||||||
|
/// bit is not set.
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x bfloat].
|
||||||
|
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
|
||||||
|
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
|
||||||
|
return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
|
||||||
|
(__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask. Elements are zeroed out when the corresponding mask
|
||||||
|
/// bit is not set.
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
|
||||||
|
static __inline__ __m256 __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
|
||||||
|
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
|
||||||
|
(__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data using merging mask.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __S
|
||||||
|
/// A 128-bit vector of [4 x float]. Elements are copied from __S when
|
||||||
|
/// the corresponding mask bit is not set.
|
||||||
|
/// \param __U
|
||||||
|
/// A 4-bit mask. Elements are zeroed out when the corresponding mask
|
||||||
|
/// bit is not set.
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x bfloat].
|
||||||
|
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
|
||||||
|
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
|
||||||
|
return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
|
||||||
|
(__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
|
||||||
|
16));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data using merging mask.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __S
|
||||||
|
/// A 256-bit vector of [8 x float]. Elements are copied from __S when
|
||||||
|
/// the corresponding mask bit is not set.
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask. Elements are zeroed out when the corresponding mask
|
||||||
|
/// bit is not set.
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
|
||||||
|
static __inline__ __m256 __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
|
||||||
|
return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
|
||||||
|
(__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
|
||||||
|
16));
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS128
|
||||||
|
#undef __DEFAULT_FN_ATTRS256
|
||||||
|
|
||||||
|
#endif
|
||||||
|
#endif
|
151
third_party/intel/clang/avx512vlbitalgintrin.h
vendored
Normal file
151
third_party/intel/clang/avx512vlbitalgintrin.h
vendored
Normal file
|
@ -0,0 +1,151 @@
|
||||||
|
/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __AVX512VLBITALGINTRIN_H
|
||||||
|
#define __AVX512VLBITALGINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS128 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512vl,avx512bitalg,no-evex512"), \
|
||||||
|
__min_vector_width__(128)))
|
||||||
|
#define __DEFAULT_FN_ATTRS256 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512vl,avx512bitalg,no-evex512"), \
|
||||||
|
__min_vector_width__(256)))
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_popcnt_epi16(__m256i __A)
|
||||||
|
{
|
||||||
|
return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
|
||||||
|
(__v16hi) _mm256_popcnt_epi16(__B),
|
||||||
|
(__v16hi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
|
||||||
|
{
|
||||||
|
return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
|
||||||
|
__U,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_popcnt_epi16(__m128i __A)
|
||||||
|
{
|
||||||
|
return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
|
||||||
|
(__v8hi) _mm_popcnt_epi16(__B),
|
||||||
|
(__v8hi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
|
||||||
|
{
|
||||||
|
return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
|
||||||
|
__U,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_popcnt_epi8(__m256i __A)
|
||||||
|
{
|
||||||
|
return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
|
||||||
|
(__v32qi) _mm256_popcnt_epi8(__B),
|
||||||
|
(__v32qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
|
||||||
|
{
|
||||||
|
return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
|
||||||
|
__U,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_popcnt_epi8(__m128i __A)
|
||||||
|
{
|
||||||
|
return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
|
||||||
|
(__v16qi) _mm_popcnt_epi8(__B),
|
||||||
|
(__v16qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
|
||||||
|
{
|
||||||
|
return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
|
||||||
|
__U,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B)
|
||||||
|
{
|
||||||
|
return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
|
||||||
|
(__v32qi) __B,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B)
|
||||||
|
{
|
||||||
|
return _mm256_mask_bitshuffle_epi64_mask((__mmask32) -1,
|
||||||
|
__A,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B)
|
||||||
|
{
|
||||||
|
return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
|
||||||
|
(__v16qi) __B,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B)
|
||||||
|
{
|
||||||
|
return _mm_mask_bitshuffle_epi64_mask((__mmask16) -1,
|
||||||
|
__A,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS128
|
||||||
|
#undef __DEFAULT_FN_ATTRS256
|
||||||
|
|
||||||
|
#endif
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue